In [1]:
import pandas as pd
y = [10,17,16,15,22,31,8,45]
x = [2,3,4,5,6,7,8,9]
y = [12,14.4,14.6,16,11.3,10,16.2,10.4,13.1,11.3]
x = [12,21,18,22,13,10,23,11,16,14]
df = pd.DataFrame({"Y":y,"X":x})
df.index+=1
df.T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Y,12.0,14.4,14.6,16.0,11.3,10.0,16.2,10.4,13.1,11.3
X,12.0,21.0,18.0,22.0,13.0,10.0,23.0,11.0,16.0,14.0


### Least Squares Method

In [2]:
import matplotlib.pyplot as plt
import numpy as np

y = np.matrix(y).T #dependent variable 
x1 = x #independent variable

x = np.matrix([np.ones(len(y)),x1]).T
xx = np.dot(x.T,x)
xinv = np.linalg.inv(xx) #(x'x)-1
xy = np.dot(x.T,y)  #x'y
beta = xinv*xy

b0 = np.round(beta[0].item(0),4)
b1 = np.round(beta[1].item(0),4)

print(f"x\n{x}\n\nx'\n{x.T}\n\nx'x\n{xx}\n\ny\n{y}\n\nx'y\n{xy}\n\n(x'x)^-1\n{xinv}\n\n((x'x)^-1)*(x'y)\n{beta}",
f"\n\nY predicted model = {b0} + {b1} * x1 ")

predicts = list(map(lambda x:b0+b1*x,x1))
predicts = np.array(np.round(predicts,4))
print("\nPredicted values = ",predicts)

err = np.zeros(len(y))
n = len(y)

for i in range(len(y)):
    err[i]=y[i]-predicts[i]
    
print("\nErrors = ",err)


x
[[ 1. 12.]
 [ 1. 21.]
 [ 1. 18.]
 [ 1. 22.]
 [ 1. 13.]
 [ 1. 10.]
 [ 1. 23.]
 [ 1. 11.]
 [ 1. 16.]
 [ 1. 14.]]

x'
[[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
 [12. 21. 18. 22. 13. 10. 23. 11. 16. 14.]]

x'x
[[  10.  160.]
 [ 160. 2764.]]

y
[[12. ]
 [14.4]
 [14.6]
 [16. ]
 [11.3]
 [10. ]
 [16.2]
 [10.4]
 [13.1]
 [11.3]]

x'y
[[ 129.3]
 [2162.9]]

(x'x)^-1
[[ 1.35490196 -0.07843137]
 [-0.07843137  0.00490196]]

((x'x)^-1)*(x'y)
[[5.54960784]
 [0.46127451]] 

Y predicted model = 5.5496 + 0.4613 * x1 

Predicted values =  [11.0852 15.2369 13.853  15.6982 11.5465 10.1626 16.1595 10.6239 12.9304
 12.0078]

Errors =  [ 0.9148 -0.8369  0.747   0.3018 -0.2465 -0.1626  0.0405 -0.2239  0.1696
 -0.7078]


### Standardization Residuals Analysis

In [3]:
sse =np.sum(list(map(lambda x : x**2,err)))

mse = sse/(len(y)-1)
sigma = np.sqrt(mse)
d = []
for i in err:
    d.append(i/sigma)
df2 = pd.DataFrame(d)
df2.index+=1
print(df2.T)
print(f"\nMean Squared Error = {mse}")
for i in d:
    if i>=2 or i<=-2:
        print(f"\n{np.round(i,4)} have to examined.")
        

         1         2         3         4         5         6         7   \
0  1.624198 -1.485889  1.326275  0.535836 -0.437653 -0.288691  0.071906   

         8         9         10  
0 -0.397527  0.301119 -1.256676  

Mean Squared Error = 0.3172300399999997


### Heteroskedasticity Problem

In [4]:
df["rank(x)"] = df["X"].rank()
df["predicts"] = predicts
df["errors"] = err
df["rank(e)"] = df["errors"].rank()
df["di"] = df["rank(e)"]-df["rank(x)"]
df["di^2"] = df["di"]**2
df.T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Y,12.0,14.4,14.6,16.0,11.3,10.0,16.2,10.4,13.1,11.3
X,12.0,21.0,18.0,22.0,13.0,10.0,23.0,11.0,16.0,14.0
rank(x),3.0,8.0,7.0,9.0,4.0,1.0,10.0,2.0,6.0,5.0
predicts,11.0852,15.2369,13.853,15.6982,11.5465,10.1626,16.1595,10.6239,12.9304,12.0078
errors,0.9148,-0.8369,0.747,0.3018,-0.2465,-0.1626,0.0405,-0.2239,0.1696,-0.7078
rank(e),10.0,1.0,9.0,8.0,3.0,5.0,6.0,4.0,7.0,2.0
di,7.0,-7.0,2.0,-1.0,-1.0,4.0,-4.0,2.0,1.0,-3.0
di^2,49.0,49.0,4.0,1.0,1.0,16.0,16.0,4.0,1.0,9.0


In [5]:
rs = 1 - ((6*df["di^2"].sum())/(n*(n**2)-n))
rs

0.09090909090909094

$$H_0 : \rho = 0$$<br>$$H_s : \rho \neq 0$$

In [6]:
t = ((rs)*(np.sqrt(n-2)))/(np.sqrt(1-rs**2))
t

0.25819888974716126

In [7]:
import scipy.stats as ss
th = ss.t.ppf(q=0.05,df=n-2) #1.94
if th>t:
    print("You fail to reject the null hypothesis. There is not heteroskedasticity.")
else:
    print("You can reject the null hypothesis. There is heteroskedasticity. ")


You can reject the null hypothesis. There is heteroskedasticity. 


### Autocorrelation

In [8]:
et = []
for i in range(n):
    et.append(err[i]*err[i-1])
df["etet_1"] = et
df

Unnamed: 0,Y,X,rank(x),predicts,errors,rank(e),di,di^2,etet_1
1,12.0,12,3.0,11.0852,0.9148,10.0,7.0,49.0,-0.647495
2,14.4,21,8.0,15.2369,-0.8369,1.0,-7.0,49.0,-0.765596
3,14.6,18,7.0,13.853,0.747,9.0,2.0,4.0,-0.625164
4,16.0,22,9.0,15.6982,0.3018,8.0,-1.0,1.0,0.225445
5,11.3,13,4.0,11.5465,-0.2465,3.0,-1.0,1.0,-0.074394
6,10.0,10,1.0,10.1626,-0.1626,5.0,4.0,16.0,0.040081
7,16.2,23,10.0,16.1595,0.0405,6.0,-4.0,16.0,-0.006585
8,10.4,11,2.0,10.6239,-0.2239,4.0,2.0,4.0,-0.009068
9,13.1,16,6.0,12.9304,0.1696,7.0,1.0,1.0,-0.037973
10,11.3,14,5.0,12.0078,-0.7078,2.0,-3.0,9.0,-0.120043


In [10]:
rho = df["etet_1"].sum()/sse
rho

-0.7077911838221732

There is negative autocorrelation