In [1]:
import numpy as np
from scipy.optimize import minimize
import statsmodels.api as sm
from scipy.stats import norm

In [2]:
def gen_data(nobs=1000, a=2, b=1, c=2, d=1):
    #true value
    y = np.random.normal(loc=0, scale=1, size=(nobs,1))
    
    #measurement error terms
    e1 = np.random.normal(loc=0, scale=1, size=(nobs,1))
    e2 = np.random.normal(loc=0, scale=1, size=(nobs,1))
    
    #procedure
    y1 =b+ a*y + e1
    y2 =d+ c*y + e2
    return y1,y2,y,nobs

y1n,y2n,y,n = gen_data()


print(y1n.mean())
print(y2n.mean())
print(y1n.var())
print(y2n.var())

1.0165722743927872
1.0186454132611855
5.2784622885286385
5.170315164928033


In [3]:
def estimate(y1n,y2n):
    b = y1n.mean()
    d = y2n.mean()
    a = np.sqrt(y1n.var() - 1)
    c = np.sqrt(y2n.var() - 1)
    sign = ((y1n*y2n).mean() - b*d) > 0
    if not sign:
        c = -1*c
    return a,b,c,d

for i in [(3,1,2,4),(2,4,-5,1),(2,4,5,1)]:
    y1n,y2n,y,n = gen_data(a=i[0],b=i[1],c=i[2],d=i[3])
    print(estimate(y1n,y2n))

(2.78194677580385, 1.0719628892074562, 1.9019601326046065, 4.101341335502296)
(1.9906310855502916, 3.997245702490851, -4.957689734013365, 0.8940258084481163)
(1.8871531196438374, 4.030457811966271, 4.573551238942853, 0.9811389866872948)


In [4]:
#impute value of y*
##compare just averaging y1+y2 with this.....

def conditional_exp1(x,y,a,b,c,d):
    """numerically integrate things"""
    grid_size = 1000
    z_range = 100
    z_grid = np.linspace(-z_range/2,
                z_range/2, grid_size)

    method1 = (z_range/grid_size)*z_grid*norm.pdf(y-(a*z_grid+b))*norm.pdf(x-(c*z_grid+d))
    method1 = method1.sum()
    return method1


def conditional_exp2(x,y,a,b,c,d):
    """i got this from wolfram"""
    method2 = np.exp(-(a*(x-d) - c*(y-b))**2/(2*(a**2+c**2)))*(a*(y-b) + c*(x-d))/(np.sqrt(2*np.pi)*(a**2+c**2)**(3/2))
    return  method2
    
    
for j in ([0,0],[0,1],[1,0],[1,1],[2,3],[5,7]):
    a,b,c,d = 5,1,4,1
    print(conditional_exp1(j[0],j[1],a,b,c,d),conditional_exp2(j[0],j[1],a,b,c,d))

-0.013497272132567866 -0.013510782915483343
-0.004476647577573008 -0.004481128706279284
-0.006244956054242419 -0.006251207261503918
-3.2526065174565133e-19 0.0
0.019044138757173486 0.019063201959132615
0.057453595699030253 0.05751110680583605


In [5]:
def estimate_predict(y1n,y2n):
    a,b,c,d = estimate(y1n,y2n)
    pred = conditional_exp2(y1n,y2n,a,b,c,d)
    return pred 

def mse(x,y):
    return ((x-y)**2).mean()

y1n,y2n,y,n = gen_data()
y_pred = estimate_predict(y1n,y2n)
result = np.concatenate( (y,(y1n+y2n)/2,  estimate_predict(y1n,y2n) ) ,axis=1)
print(result[0:10,:])
print(mse(y,(y1n+y2n)/2),mse(y,estimate_predict(y1n,y2n)))

#basically, its just (x+y)/2 ... but down weight towards average... the more x and y differ...
#who has estiamted this model?

[[ 0.80077059  2.32610783  0.05826282]
 [-1.22663466 -1.96806658 -0.19422092]
 [-0.36802172  0.81390136 -0.01436219]
 [ 0.38144356  2.06388893  0.00974016]
 [ 1.79940323  4.31466225  0.1953541 ]
 [ 1.15373846  3.08024423  0.13154365]
 [-0.06810145  0.32357966 -0.03178163]
 [ 0.42857721  1.7169856   0.04197651]
 [ 0.08148326  2.51348959  0.06514237]
 [-2.10887259 -3.94063766 -0.21402235]]
2.5884428894301497 0.9422658504632397
