# Consistency & Asymptotic Normality of Simple Linear Regression (SLR)

This notebook demonstrates two properties of the slope estimator $\hat{\beta}$ in simple linear regression.

1. Consistency: $\hat{\beta}$ converges in probability to the true $\beta$ as N (the number of sampled data points) grows.
2. Asymptotic Normality: the sampling distribution of $\hat{\beta}$ approaches a normal distribution as N grows.

Run the code cell below, adjust the slider, and press "Run Interact" to create the visualizations.

In [2]:
import numpy as np
import math 
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import scipy.stats as stats
from sklearn import datasets, linear_model

def _noisy_response(x, beta0, beta1, half_width):
    """Return beta0 + beta1 * x plus Uniform(-half_width, half_width) noise.

    One independent noise value is drawn per element of ``x`` with
    ``np.random.rand``, so the result has the same shape as ``x``.
    """
    noise = -half_width + np.random.rand(*x.shape) * 2 * half_width
    return beta0 + beta1 * x + noise


def SLR(N):
    """Visualize consistency and asymptotic normality of the SLR slope estimate.

    Prints explanatory text and shows three matplotlib figures:

    1. A scatter of N points drawn from Y = 2X + 0.5 with uniform noise in
       [-2, 2], together with the least-squares line fitted to them.
    2. The running mean of slope estimates computed on the first 3, 4, ..., N
       X values, with a horizontal line at the true slope.
    3. A density histogram of 100 slope estimates, each from a fresh sample
       of N points, overlaid with a normal density centred on the true slope.

    Args:
        N: Number of points per sample. The slider in this notebook passes
            values from 3 to 200; below 3 the second figure has no data.

    Returns:
        None. All output is printed or plotted.
    """
    # True parameters of the data-generating line.
    beta0 = 0.5
    beta1 = 2.0

    print("The set of N=" + str(N) + " sampled points shown below are sampled based on Y=2X+0.5 with noise")
    print("The orange line indicates a fitted regression line based on the sample. It may not exactly be equal to Y=2X+0.5")

    # --- Figure 1: one sample and its fitted line -------------------------
    # X ~ Uniform(0, 1), kept as an (N, 1) column because scikit-learn
    # expects 2-D feature arrays.
    sampleX = np.random.rand(N, 1)
    sampleY = _noisy_response(sampleX, beta0, beta1, 2)
    plt.plot(sampleX, sampleY, 'o')

    regr = linear_model.LinearRegression()
    regr.fit(sampleX, sampleY)

    xx = np.linspace(0, 1, 300)
    yy = regr.intercept_[0] + regr.coef_[0][0] * xx
    plt.plot(xx, yy)
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.show()

    # --- Figure 2: slope estimate versus sample size ----------------------
    # For each prefix of the X sample, fresh responses are drawn (with
    # narrower noise, [-1, 1]) and the slope is estimated as
    # cov(X, Y) / var(X).
    sizes = np.arange(3, N + 1)
    betahats = []
    for size in sizes:
        sX = sampleX[:size]
        sY = _noisy_response(sX, beta0, beta1, 1)
        cov = np.cov(sX.ravel(), sY.ravel())
        betahats.append(cov[0, 1] / cov[0, 0])

    # NOTE: the curve is the running mean of the estimates so far, not each
    # individual estimate; this smooths the plot.
    meanbetahats = np.cumsum(betahats) / np.arange(1, len(betahats) + 1)

    print("The plot below displays how betahat converges to the true beta=2,")
    print("as more and more samples are used to fit a regression line.")
    print("Thus, the estimator betahat is consistent.")

    # Plot against the real sample sizes so the x-axis matches its label.
    plt.plot(sizes, meanbetahats)
    plt.xlabel("N (number of samples used)")
    plt.ylabel("betahat of fitted line")
    plt.axhline(y=beta1, color='r')
    plt.show()

    # --- Figure 3: sampling distribution of the slope ---------------------
    betahats = []
    for _ in range(100):
        sampleX = np.random.rand(N, 1)
        sampleY = _noisy_response(sampleX, beta0, beta1, 2)
        regr = linear_model.LinearRegression()
        regr.fit(sampleX, sampleY)
        betahats.append(regr.coef_[0][0])

    # Normal density centred on the true slope, with the empirical variance
    # of the 100 estimates.
    var = np.var(betahats)
    xx = np.linspace(min(betahats), max(betahats), 300)
    yy = np.exp(-(xx - beta1) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)

    print("Now, N=" + str(N) + " points are sampled 100 times, creating 100 fitted lines with betahats")
    print("The distribution of betahats would resemble a normal distribution with a large enough N (by CLT)")

    plt.plot(xx, yy)
    # `normed` was removed from Matplotlib; `density=True` is the replacement
    # and scales the histogram to unit area so it is comparable to the pdf.
    plt.hist(betahats, density=True)
    plt.xlabel("value of betahats")
    plt.ylabel("relative frequency")
    plt.show()
    
# Build the interactive control for N (range 3 to 200); SLR is run only when
# the "Run Interact" button is pressed, not on every slider change.
interact_manual(SLR, N=(3, 200))

<function __main__.SLR>