##### Linear regression tests
This notebook examines various formulae and calculations for unweighted least-squares linear regression.

In [1]:
import numpy as np
from scipy import stats

# As suggested in https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/
# NOTE(review): "all" is understood to make IPython display the value of every
# bare expression statement in a cell rather than only the last one -- confirm
# against the IPython documentation for the installed version.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Small worked example taken from Sachs, Table 109: seven paired (x, y) observations.
x = np.array([13, 17, 10, 17, 20, 11, 15])
y = np.array([12, 17, 11, 13, 16, 14, 15])

def lsfit_Sachs(x,y,iverbose=1):
    """Unweighted least-squares fit of the straight line y = a + b*x.

    Notation follows L. Sachs, 1984, Applied Statistics, Springer-Verlag, p. 417.
    PTVF = Press et al., 1992, Numerical Recipes in FORTRAN, Second Edition.

    Parameters
    ----------
    x, y : array-like
        Paired observations with identical shapes. Both are converted to
        float arrays, so lists/tuples are accepted and integer inputs cannot
        overflow when the sums of squares are formed.
    iverbose : int or bool, optional
        When truthy (the default), print a short summary of the fit.

    Returns
    -------
    b : float
        Slope.
    sb : float
        Standard deviation of the slope.
    a : float
        Intercept.
    sa : float
        Standard deviation of the intercept.
    r2 : float
        Squared correlation coefficient.
    chisq : float
        Sum of squared residuals (PTVF eqn. 15.2.2 with all weights = 1).

    Raises
    ------
    ValueError
        If x and y do not have the same shape.

    Notes
    -----
    sb and sa divide by (n - 2), so they are not meaningful for fewer than
    three points.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError(
            'x and y must have the same shape, got {0} and {1}'.format(x.shape, y.shape))

    n = float(x.size)
    sumx = np.sum(x)
    sumy = np.sum(y)

    sumx2 = np.sum(x * x)
    sumy2 = np.sum(y * y)
    sumxy = np.sum(x * y)

    # Sums of squares / cross-products about the means
    Qx = sumx2 - sumx**2/n
    Qy = sumy2 - sumy**2/n
    Qxy = sumxy - (1./n)*sumx*sumy

    xbar = sumx/n

    b = Qxy/Qx
    a = (sumy - b*sumx)/n

    r = Qxy/np.sqrt(Qx*Qy)
    r2 = r*r

    # Residual sum of squares about the line, and the residual std. dev.
    Qydotx = Qy - b*Qxy
    sydotx = np.sqrt(Qydotx/(n-2))

    # s.d. of slope and intercept
    sb = sydotx/np.sqrt(Qx)
    sa = sydotx*np.sqrt(1./n + xbar**2/Qx)

    yhat = a + b*x
    # For unweighted least-squares: PTVF eqn. 15.2.2, but with all weights = 1
    chisq = np.sum((y - yhat)**2)
    if iverbose:
        print('Slope: {0:.3f} +- {1:.4f}\nIntercept: {2:.3f} +- {3:.4f}\nr2 = {4:.4f}; chi-square = {5:.3f}'
              .format(b, sb, a, sa, r2, chisq))
    return b, sb, a, sa, r2, chisq

# Fit the example data; the six results are kept as notebook-level names that
# the consistency checks below read.
b,sb,a,sa,r2,chisq = lsfit_Sachs(x,y)

# this is an equivalent expression for r2 that can be computed outside the fitting routine:
# the sum of squares of the fitted values about the mean of y, divided by the
# sum of squares of y about its mean
ybar =np.mean(y)
yhat = a+b*x
r22 = np.sum((yhat-ybar)**2)/np.sum((y-ybar)**2)
print("These should match. r2: {}; r22: {}".format(r2,r22))

# this is how to calc. chi-square from r2 for unweighted regression
# NOTE(review): the recorded output of this cell shows chisq and chisq2 agreeing,
# so the "but dont" wording in the message below appears to be stale.
chisq2 = (1.-r2)*np.sum((y-ybar)**2) #PTVF 15.2.13 and 14
print("These should match, but dont. chisq: {}; chisq2: {}".format(chisq,chisq2))

# what about:
# the residual sum of squares computed directly from the fitted line; the bare
# name on the last line leaves the value as an expression for the cell to display
chisq3 = np.sum( (y-yhat)**2  )
chisq3

Slope: 0.426 +- 0.1897
Intercept: 7.729 +- 2.8621
r2 = 0.5023; chi-square = 13.935
These should match. r2: 0.502306273063; r22: 0.502306273063
These should match, but dont. chisq: 13.9354243542; chisq2: 13.9354243542


13.935424354243541

In [2]:
# PTVF = Press et al., 1992, Numerical Recipes in FORTRAN, Second Edition

# This routine requires per-point uncertainties `sig` (e.g., an array with the std. dev. of each y value).
# Setting every element of `sig` to one produces unweighted least-squares.
def lsfit_Press(x,y,sig,iverbose=1):
    """Least-squares fit of the straight line y = a + b*x using per-point sigmas.

    PTVF = Press et al., 1992, Numerical Recipes in FORTRAN, Second Edition.

    Parameters
    ----------
    x, y : array-like
        Paired observations with identical shapes (converted to float arrays).
    sig : array-like or scalar
        Standard deviation of each y value; broadcast to the shape of x.
        All ones gives the unweighted least-squares fit.
    iverbose : int or bool, optional
        When truthy (the default), print a short summary of the fit.

    Returns
    -------
    b : float
        Slope.
    sigb : float
        Standard deviation of the slope.
    a : float
        Intercept.
    siga : float
        Standard deviation of the intercept.
    r2 : float
        Coefficient of determination, computed with the same 1/sig**2 weights
        as the fit (identical to the unweighted r2 when sig is all ones).
    chi2 : float
        Chi-square of the fit, PTVF eqn. 15.2.2.

    Raises
    ------
    ValueError
        If x and y differ in shape, or sig cannot be broadcast to that shape.

    Notes
    -----
    siga and sigb are always rescaled by sqrt(chi2/(n-2)), so only the
    relative sizes of the sig values affect the reported uncertainties, and
    the uncertainties are not meaningful for fewer than three points.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError(
            'x and y must have the same shape, got {0} and {1}'.format(x.shape, y.shape))
    # Broadcasting lets a scalar sig stand for "same uncertainty on every point";
    # without it the weight sum ss below would count that scalar only once.
    sig = np.broadcast_to(np.asarray(sig, dtype=float), x.shape)

    n = float(x.size)
    wt = 1./sig**2
    sx = np.sum(x*wt)
    sy = np.sum(y*wt)
    ss = np.sum(wt)
    sxoss = sx/ss   # weighted mean of x
    syoss = sy/ss   # weighted mean of y

    t = (x - sxoss)/sig
    st2 = np.sum(t**2)
    b = np.sum((t*y)/sig)/st2
    a = (sy - sx*b)/ss
    siga = np.sqrt((1. + sx**2/(ss*st2))/ss)
    sigb = np.sqrt(1./st2)

    # Fitted values are computed locally; the function must not depend on a
    # `yhat` left behind in the notebook namespace by another cell.
    yhat = a + b*x
    chi2 = np.sum(((y - yhat)/sig)**2) #PTVF eqn. 15.2.2

    # Rescale the parameter uncertainties by the scatter about the fitted line
    sigdat = np.sqrt(chi2/(n-2))
    siga = siga*sigdat
    sigb = sigb*sigdat

    # Weighted sums about the weighted mean, so that chi2 == (1 - r2) * (weighted
    # total sum of squares) holds for any sig, not only for unit sig.
    r2 = np.sum(((yhat - syoss)/sig)**2)/np.sum(((y - syoss)/sig)**2)
    if iverbose:
        print('Slope: {0:.3f} +- {1:.4f}\nIntercept: {2:.3f} +- {3:.4f}\nr2 = {4:.4f}; chi-square = {5:.3f}'
              .format(b, sigb, a, siga, r2, chi2))
    return b, sigb, a, siga, r2, chi2
# Unit sigma for every point, which makes the fit an unweighted least-squares fit.
sig = np.ones_like(x)
# The six results are kept as notebook-level names that the checks below read.
b,sb,a,sa,r2,chisq = lsfit_Press(x,y,sig)

# this is an equivalent expression for r2 that can be computed outside the fitting routine:
# the sum of squares of the fitted values about the mean of y, divided by the
# sum of squares of y about its mean
ybar =np.mean(y)
yhat = a+b*x
r22 = np.sum((yhat-ybar)**2)/np.sum((y-ybar)**2)
print("These should match. r2: {}; r22: {}".format(r2,r22))

# this is how to calc. chi-square from r2 for unweighted regression
chisq2 = (1.-r2)*np.sum((y-ybar)**2) #PTVF 15.2.13 and 14
print("These should match. chisq: {}; chisq2: {}".format(chisq,chisq2))


Slope: 0.426 +- 0.1897
Intercept: 7.729 +- 2.8621
r2 = 0.5023; chi-square = 13.935
These should match. r2: 0.502306273063; r22: 0.502306273063
These should match. chisq: 13.9354243542; chisq2: 13.9354243542


In [3]:
# Cross-check with SciPy's linregress. Its numbers should agree with the fits above,
# but no intercept uncertainty is unpacked from it here, so a literal 0 is printed
# in that slot of the report.
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(
    'Slope: {0:.3f} +- {1:.4f}\nIntercept: {2:.3f} +- {3:.4f}\nr2 = {4:.4f}'.format(
        slope, std_err, intercept, 0, r_value**2
    )
)


Slope: 0.426 +- 0.1897
Intercept: 7.729 +- 0.0000
r2 = 0.5023
