In [1]:
import numpy as np
import statsmodels.api as sm

In [2]:
def gen_data(nobs, num_cov, a):
    """Simulate an outcome and a set of positively correlated covariates.

    Each covariate column is an independent standard-normal draw plus one
    shared standard-normal component (the same draw is added to every
    column).  Column 1 is then rescaled by ``a`` and is the only covariate
    entering the outcome: ``y = a + x[:, 1] + e`` with ``e ~ N(0, 1)``.

    Draws come from NumPy's global random state (``np.random``), so seed it
    beforehand for reproducible output.

    Parameters
    ----------
    nobs : int
        Number of observations (rows).
    num_cov : int
        Number of covariate columns.  Must be at least 2, because column 1
        is the one that drives the outcome.
    a : float
        Scale applied to column 1; also used as the intercept of ``y``.

    Returns
    -------
    yn : numpy.ndarray, shape (nobs,)
        Simulated outcome.
    xn : numpy.ndarray, shape (nobs, num_cov)
        Simulated covariates.

    Raises
    ------
    ValueError
        If ``num_cov`` is smaller than 2.
    """
    if num_cov < 2:
        raise ValueError("num_cov must be at least 2 (column 1 drives the outcome)")
    xn = np.random.normal(scale=1., size=(nobs, num_cov))
    x0 = np.random.normal(scale=1., size=(nobs, 1))
    # Replicate the shared component once per covariate column.  This used to
    # be a hard-coded 2, which broke broadcasting for any num_cov > 2.
    x0 = np.tile(x0, num_cov)
    xn = xn + x0
    xn[:, 1] = a * xn[:, 1]
    e = np.random.normal(loc=0.0, scale=1.0, size=(nobs))
    yn = a + (xn[:, 1]) + e
    return yn, xn

# Seed NumPy's global generator (the one gen_data draws from) so that a
# Restart & Run All reproduces the same simulated data set.
np.random.seed(0)
y,x = gen_data(1000,2,.25)
print(y.shape,x.shape)

(1000,) (1000, 2)


In [3]:
# Benchmark ("true") specification: y on an intercept plus every column of x.
design_full = sm.add_constant(x)
model0 = sm.OLS(y, design_full).fit()

# Ratio of the variance of the fitted values to the variance of y.
r2_full = model0.fittedvalues.var() / y.var()
print('r2 true', r2_full)

r2 true 0.09924067273827342


In [4]:
# Two competing single-covariate specifications:
# model1 uses only column 0 of x, model2 uses only column 1.
model1, model2 = (sm.OLS(y, sm.add_constant(x[:, col])).fit() for col in (0, 1))

# Same variance-ratio r2 as for the full model, once per specification.
var_y = y.var()
for tag, fitted_model in (('r2 1', model1), ('r2 2', model2)):
    print(tag, fitted_model.fittedvalues.var()/var_y)

r2 1 0.02914122539713672
r2 2 0.09920728721855364


Approach 1: regress model 1's residuals on model 2's fitted values

In [5]:
## Author's note: this looks like the right (better) way to do it.
# Regress what model1 failed to explain on what model2 predicts.
fit2_design = sm.add_constant(model2.fittedvalues)
error_reg = sm.OLS(model1.resid, fit2_design).fit()

# Split model1's unexplained variation into two pieces: the part model2's
# predictions pick up, and the part that is still left over.  The two
# printed numbers should coincide.
recovered_var = error_reg.fittedvalues.var()
print(model1.resid.var())
print(error_reg.resid.var() + recovered_var)

# Open question from the author: is the first number below the additional
# r2 that model2 can add on top of model1?  It is compared later with the
# covariance-matrix version of the same quantity.
var_y = y.var()
print(recovered_var/var_y)
print((model1.fittedvalues.var() + recovered_var)/var_y)

1.0467324081397495
1.0467324081397498
0.050676495756907414
0.07981772115404413


approach 2: covariance matrices

In [6]:
# Covariance-matrix approach.
# Row/column order: 0 = y, 1 = model1 fitted values, 2 = model2 fitted values.
stacked = [y, model1.fittedvalues, model2.fittedvalues]
cov = np.cov(stacked)
print(cov)

# Double-check that each diagonal entry divided by cov[0,0] is that model's
# r2: print the residual-based r2 first, then the covariance-based one,
# for model1 and then for model2.
cov_yy = cov[0, 0]
for idx, fitted_model in ((1, model1), (2, model2)):
    print(1 - (y - fitted_model.fittedvalues).var()/y.var())
    print(cov[idx, idx]/cov_yy)

[[1.07923028 0.03145009 0.10706751]
 [0.03145009 0.03145009 0.030545  ]
 [0.10706751 0.030545   0.10706751]]
0.029141225397136927
0.029141225397136712
0.09920728721855365
0.09920728721855364


In [7]:
# How much could the r2 of model1 be improved, given model2's predictions?
# cov[0,2] - cov[1,2] is the covariance between (y - model1 fit) and the
# model2 fit, so dividing by var(model2 fit) gives a regression slope.
resid1_fit2_cov = cov[0,2] - cov[1,2]
beta = resid1_fit2_cov/(cov[2,2])
print(beta)

# Variance explained by that slope, as a share of var(y) -- this is it!
print( (beta**2*cov[2,2])/cov[0,0])

# Same quantity with beta substituted out -- this is it!
print( resid1_fit2_cov**2/cov[2,2]/cov[0,0])
# TODO: think this through using the variance/covariance formula.

# Interpretation notes:
# - how far is model1 from the true model, using model2 as a yardstick?
# - distance of model1 from the truth, relative to the unexplained error.

0.7147127026978349
0.05067649575690736
0.050676495756907365


Relationship with informativeness

In [8]:
## Author's note: this looks like the right (better) way to do it.
# Same auxiliary regression as in approach 1: model1 residuals on an
# intercept plus model2's fitted values.
error_reg = sm.OLS(
    model1.resid,
    sm.add_constant(model2.fittedvalues),
).fit()

# Informativeness of model2 about model1's residuals: share of the residual
# variance that the auxiliary regression reproduces.
print(error_reg.fittedvalues.var()/model1.resid.var())

# The same share written purely in terms of covariance-matrix entries; the
# author asked whether the two differ -- compare the two printed values.
numer = (cov[0,2] - cov[1,2])**2
denom = cov[2,2]*(cov[0,0]-cov[1,1])
print( numer/denom )

0.05219759771717262
0.05219759771717257


In [10]:
# Variant of the covariance-based ratio with a different normalisation.
# NOTE(review): the cross term below is cov[0,1]; if var(model2 fit -
# model1 fit) was intended it would be cov[1,2] -- confirm which is meant.
numer = (cov[0,2] - cov[1,2])**2
denom = cov[0,0]*(cov[2,2]+cov[1,1]-2*cov[0,1])
print( numer/denom )

0.07175339299929927
