# Central Limit Theorem (CLT)

This notebook demonstrates the effect of the Central Limit Theorem (CLT) as more data points are averaged. The CLT states that when independent random variables are added, their properly normalized sum tends toward a normal distribution (informally a "bell curve") even if the original variables themselves are not normally distributed (see https://en.wikipedia.org/wiki/Central_limit_theorem).
<br>
Run the code cell below, set the parameters with the sliders, and press "Run Interact" to plot graphs that display the effect of the CLT.
<br>
In this applet, we will sample data points from an exponential distribution.
<br>
mean: true mean of the exponential distribution
<br>
N: the number of data points averaged to produce each sample average (e.g. N=5 will average over groups of 5)

In [7]:
import numpy as np
import math 
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

def _group_means(samples, group_size):
    """Return the mean of each consecutive, non-overlapping group of samples.

    ``len(samples)`` must be a multiple of ``group_size``.
    """
    return np.asarray(samples).reshape(-1, group_size).mean(axis=1)


def _rounded(values):
    """Return ``values`` as a list of plain floats rounded to 2 decimals."""
    # Converting to float keeps the printed list free of NumPy scalar wrappers.
    return [round(float(v), 2) for v in values]


def _plot_averages(ax, averages, curve_label, hist_label, xlabel):
    """Draw a density histogram of ``averages`` plus a fitted normal curve.

    The normal curve uses the sample mean and (population) variance of
    ``averages`` and is evaluated on the fixed grid [0, 7].
    """
    mu = np.mean(averages)
    var = np.var(averages)
    grid = np.linspace(0, 7, 100)
    pdf = np.exp(-(grid - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
    ax.plot(grid, pdf, label=curve_label)
    # ``density=True`` replaces the ``normed=True`` keyword, which newer
    # matplotlib releases no longer accept.
    ax.hist(averages, label=hist_label, density=True)
    ax.legend()
    ax.set_xlabel(xlabel)
    ax.set_ylabel("relative frequency")


def CLT(mean, N):
    """Illustrate the Central Limit Theorem with exponential samples.

    Plots the exponential density with the given mean, then draws random
    samples, averages them in groups of 3 and in groups of ``N`` (100
    averages each), prints the averages, and shows both histograms side by
    side with a normal curve fitted to each set of averages.

    Parameters
    ----------
    mean : float
        Mean of the exponential distribution (must be positive).
    N : int
        Number of samples per average for the second histogram (>= 1).

    Raises
    ------
    ValueError
        If ``mean`` is not positive or ``N`` is smaller than 1.
    """
    if mean <= 0:
        raise ValueError("mean must be positive, got " + str(mean))
    if N < 1:
        raise ValueError("N must be at least 1, got " + str(N))

    ld = 1.0 / mean
    true_mean = 1 / float(ld)

    print("First, the data would be sampled from the following exponential distribution.")
    print("The mean of your choice is shown with the vertical green line.")
    x = np.linspace(0, 7, 100)
    y = ld * np.exp(-ld * x)
    plt.plot(x, y)
    plt.xlabel("x")
    plt.ylabel("P(x)")
    plt.title("exponential distribution with lambda: " + str(round(ld, 2)))
    plt.axvline(x=true_mean, color='g',
                label='mean(1/lambda) = ' + str(round(true_mean, 2)))
    plt.legend()
    plt.show()

    # 300 samples -> 100 disjoint groups of exactly 3 (the previous slice took
    # one extra element, so each "average of 3" really covered 4 samples).
    rds = np.random.exponential(mean, size=300)
    avg3s = _group_means(rds, 3)

    print("The histogram of the sample averages (by groups of 3) can be compared with the original distribution.")
    print("300 data points were sampled in total, and each group of 3 produced an average, for example:")
    for i in range(4):
        print("samples " + str(3 * i + 1) + "~" + str(3 * (i + 1)) + ": ",
              rds[3 * i:3 * (i + 1)], "-> average:", round(avg3s[i], 2))
    print("...")
    print("samples 298~300:", rds[297:], "-> average:", round(avg3s[-1], 2))

    print("\nAnd these are the 100 sample averages (of 3) that will be drawn below:")
    # Build a real list: printing a bare ``map`` object shows no values on Python 3.
    print(_rounded(avg3s))

    rds = np.random.exponential(mean, size=100 * N)
    avgNs = _group_means(rds, N)

    print("\nYou can also test with different Ns by using the slidebar. With the current N=" + str(N) +
          ", these are the 100 sample averages (of " + str(N) + ") that will be drawn below:")
    print(_rounded(avgNs))

    _, (ax3, axN) = plt.subplots(1, 2, figsize=(14, 5))
    _plot_averages(ax3, avg3s,
                   "normal distribution for averages of 3",
                   "averages of 3 samples",
                   "values of samples/averages")
    _plot_averages(axN, avgNs,
                   "normal distribution for averages of N",
                   "averages of N=" + str(N) + " samples",
                   "values of averages")
    plt.show()

    print("The histograms of 3-averages and N-averages are shown above.")
    print("Observe that although the data were sampled from an exponential distribution,")
    print("the distribution of averages look closer to a normal distribution(bell curve) as N gets larger.")

# Hook CLT up to interactive controls: "mean" ranges over 0.5..3 and "N" over
# 1..50. The manual variant only calls CLT when the "Run Interact" button is
# pressed, not on every slider movement.
interact_manual(
    CLT,
    mean=(0.5, 3),
    N=(1, 50),
)

<function __main__.CLT>