<a href="https://colab.research.google.com/github/profteachkids/CHE5136_Fall2021/blob/main/EqualAreaHistogram_NonInteger_Edges_inclass_with_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from plotly.subplots import make_subplots
import plotly.io as pio
import pandas as pd
pio.templates.default='plotly_dark'
import scipy.stats as stats

In [18]:
# Draw a small standard-normal sample; all later steps operate on the sorted values.
data = np.random.normal(size=int(1e2))
data_sorted = np.sort(data)
n = data_sorted.size
nbins = 7
nperbin = int(n // nbins)

# Position in the sorted array serves as the "independent variable" for the
# local straight-line fits used to interpolate bin edges.
x = np.arange(0, n)

# nbins+1 edge positions spaced evenly along that index axis (generally non-integer).
edges = np.linspace(0, n, nbins + 1)

# Eight neighbouring positions (-4 ... +3) are sampled around each interior edge.
offsets = np.arange(-4, 4)

# Row k holds the truncated integer indices surrounding interior edge k.
indices = (edges[1:-1, None] + offsets[None, :]).astype(np.int64)

# Sorted data values at those indices (one row per interior edge).
Y = data_sorted[indices]

# Design matrix per interior edge: columns are [index, 1].
X = np.stack((indices, np.ones_like(indices)), axis=-1)

# Swap the last two axes; the first axis stays the batch (edge) index.
XT = np.transpose(X, (0, 2, 1))

# Least-squares operator inv(X^T X) X^T for every batch entry.
pinv = np.matmul(np.linalg.inv(np.matmul(XT, X)), XT)

# Slope and intercept for each interior edge ("a" is the batch index).
mb = np.einsum('ajk,ak->aj', pinv, Y)

# Outer edges are the data extremes; interior edges are the fitted line
# evaluated at the (fractional) edge position.
interpolated_edges = np.concatenate((
    [data_sorted[0]],
    mb[:, 0] * edges[1:-1] + mb[:, 1],
    [data_sorted[-1]],
))

# Total area is 1, so every bin has area 1/nbins and height = area / width.
height = (1. / nbins) / np.diff(interpolated_edges)

# Each edge is drawn as a vertical line, so its x value appears twice.
bin_edges = np.repeat(interpolated_edges, 2)

# The outline starts and ends at zero; each bin top is a horizontal segment,
# so every height appears twice.
bin_heights = np.concatenate(([0], np.repeat(height, 2), [0]))

In [22]:
# Inspect the batched regression operator inv(XT @ X) @ XT computed above
# (the saved output shows one 2x8 matrix per interior edge).
pinv

array([[[-0.08333333, -0.05952381, -0.03571429, -0.01190476,
          0.01190476,  0.03571429,  0.05952381,  0.08333333],
        [ 1.25      ,  0.92857143,  0.60714286,  0.28571429,
         -0.03571429, -0.35714286, -0.67857143, -1.        ]],

       [[-0.08333333, -0.05952381, -0.03571429, -0.01190476,
          0.01190476,  0.03571429,  0.05952381,  0.08333333],
        [ 2.41666667,  1.76190476,  1.10714286,  0.45238095,
         -0.20238095, -0.85714286, -1.51190476, -2.16666667]],

       [[-0.08333333, -0.05952381, -0.03571429, -0.01190476,
          0.01190476,  0.03571429,  0.05952381,  0.08333333],
        [ 3.58333333,  2.5952381 ,  1.60714286,  0.61904762,
         -0.36904762, -1.35714286, -2.3452381 , -3.33333333]],

       [[-0.08333333, -0.05952381, -0.03571429, -0.01190476,
          0.01190476,  0.03571429,  0.05952381,  0.08333333],
        [ 4.83333333,  3.48809524,  2.14285714,  0.79761905,
         -0.54761905, -1.89285714, -3.23809524, -4.58333333]],

       [

In [23]:
# Inspect the sorted-data values gathered around each interior edge
# (one row per edge, one column per offset).
Y

array([[-1.4511915 , -1.44825084, -1.44620818, -1.2802742 , -1.19784868,
        -1.14952243, -1.11848624, -1.05328707],
       [-0.86028903, -0.85984787, -0.84294278, -0.83312512, -0.83139335,
        -0.80990919, -0.79553789, -0.74116411],
       [-0.57204941, -0.56324737, -0.53975568, -0.35557123, -0.35117853,
        -0.33865084, -0.33115921, -0.33020441],
       [ 0.06386101,  0.11477662,  0.14232988,  0.22372258,  0.23994594,
         0.24820624,  0.25020775,  0.25100415],
       [ 0.49520542,  0.51752307,  0.53448552,  0.53880552,  0.59681202,
         0.60610738,  0.68171072,  0.69109344],
       [ 0.97454049,  0.97689354,  1.03057536,  1.12539997,  1.18773652,
         1.24749687,  1.27395878,  1.40646255]])

In [19]:
# Draw the histogram outline as a single connected line through the
# (bin_edges, bin_heights) vertices built in the cell above.
fig=make_subplots()
fig.add_scatter(x=bin_edges,y=bin_heights, mode='lines')

In [11]:
# Inspect the fitted [slope, intercept] pair for each interior edge.
mb

array([[ 0.02977206, -1.39448126],
       [ 0.04817954, -1.83172924],
       [ 0.01645688, -0.85071724],
       [ 0.03039483, -1.59788228],
       [ 0.02625863, -1.3013564 ],
       [ 0.05351685, -3.53361193]])

In [None]:
# Scratch expression: evaluates to the tuple (3, 2).
# NOTE(review): looks like a leftover shape check — confirm whether this cell can be removed.
3,2

In [None]:
# Compute the pseudo-inverse of the batched design matrix with NumPy's built-in routine.
# NOTE(review): the saved output below is 2x2 per batch, which looks like inv(XT @ X)
# rather than pinv(X) — the output is probably stale; re-run to confirm.
np.linalg.pinv(X)

array([[[ 2.38095238e-02, -5.83333333e-01],
        [-5.83333333e-01,  1.44166667e+01]],

       [[ 2.38095238e-02, -1.17857143e+00],
        [-1.17857143e+00,  5.84642857e+01]],

       [[ 2.38095238e-02, -1.77380952e+00],
        [-1.77380952e+00,  1.32273810e+02]]])

In [None]:

    # Scratch copy of the batched regression steps that also appear inside equal_area_histogram.
    # NOTE(review): this cell ends in an assignment, so the saved output shown below it
    # (a shape tuple and a 3x8x2 array) appears to be stale from an earlier version.
    # Swap the last two axes of X; axis 0 stays the per-edge batch index.
    XT = np.moveaxis(X,(0,1,2), (0,2,1))
    # Least-squares operator inv(XT X) XT for every batch entry.
    pinv=np.linalg.inv(XT@X) @ XT 
    # Apply the operator to each row of Y to get [slope, intercept] per edge.
    mb=np.einsum('ijk,ik->ij',pinv,Y)
    # Outer edges are the data min/max; interior edges come from the fitted lines.
    smoothed_edges = np.r_[(np.min(data), mb[:,0]*edges[1:-1] + mb[:,1], np.max(data))]

(3, 8)

array([[[21,  1],
        [22,  1],
        [23,  1],
        [24,  1],
        [25,  1],
        [26,  1],
        [27,  1],
        [28,  1]],

       [[46,  1],
        [47,  1],
        [48,  1],
        [49,  1],
        [50,  1],
        [51,  1],
        [52,  1],
        [53,  1]],

       [[71,  1],
        [72,  1],
        [73,  1],
        [74,  1],
        [75,  1],
        [76,  1],
        [77,  1],
        [78,  1]]])

In [None]:
# Batched normal-equation solve: v[k] = inv(X_k^T X_k) X_k^T y_k, i.e. the
# [slope, intercept] of the straight-line fit around interior edge k.
# - np.linalg.inv is spelled out because no bare `inv` is imported in this notebook.
# - XT (last two axes swapped) is used instead of X.T, which would reverse all
#   three axes of the batched design matrix.
# - Y gets a trailing axis so each row is treated as a column vector by matmul;
#   that axis is dropped again afterwards, giving one (slope, intercept) row per edge.
v = (np.linalg.inv(XT @ X) @ XT @ Y[..., None])[..., 0]

In [None]:
# Scatter the sorted sample against its position index; this is the curve that
# the local straight-line fits around each bin edge approximate.
fig = make_subplots(rows=1,cols=1)
fig.add_scatter(x=x, y=data_sorted, mode='markers')



In [None]:
# Try the function with an explicit bin count of 10.
# NOTE(review): this call sits above the cell that defines equal_area_histogram, so a
# top-to-bottom run on a fresh kernel would raise NameError here; the saved output
# also does not look like the function's (bin_edges, bin_heights) return — likely stale.
equal_area_histogram(data,10)

10

In [None]:
def equal_area_histogram(data, nbins=None):
    """Return the outline of an equal-area histogram with regression-smoothed edges.

    The sample is sorted and divided into ``nbins`` bins of (nearly) equal count,
    so every bin has area ``1/nbins`` and height ``area / width``.  Each interior
    edge sits at a generally non-integer position along the sorted-index axis; its
    data value is estimated by fitting a straight line (sorted value vs. index) to
    a window of roughly ``nperbin/2`` points around that position and evaluating
    the line at the edge position.

    Parameters
    ----------
    data : array_like
        Sample values; expected to be one-dimensional.
    nbins : int, optional
        Number of bins.  Defaults to ``int(2 * n**(2/5))`` for ``n`` samples.

    Returns
    -------
    bin_edges : ndarray
        x-coordinates of the outline; every edge value appears twice.
    bin_heights : ndarray
        y-coordinates of the outline; starts and ends at 0, and every bin
        height appears twice.

    Raises
    ------
    ValueError
        If ``nbins`` is smaller than 1 or larger than the number of samples.
    """
    data_sorted = np.sort(data)
    n = data_sorted.size
    nbins = int(2*n**(2/5)) if nbins is None else nbins
    if nbins < 1 or nbins > n:
        raise ValueError(
            f"nbins must be between 1 and the number of samples ({n}); got {nbins}"
        )

    # The regression window must scale with THIS call's sample size and bin count.
    # It is computed locally instead of being read from a notebook-level variable.
    nperbin = n // nbins
    # At least two points are needed for a straight-line fit.
    half_window = max(nperbin // 4, 1)
    offsets = np.arange(-half_window, half_window)

    # Evenly spaced (generally fractional) edge positions along the sorted index.
    edges = np.linspace(0, n, nbins + 1)
    # Integer indices surrounding each interior edge (one row per edge).
    indices = (edges[1:-1, None] + offsets[None, :]).astype(np.int64)
    Y = np.take(data_sorted, indices)
    # Batched design matrix with columns [index, 1].
    X = np.stack((indices, np.ones_like(indices)), axis=2)
    XT = np.moveaxis(X, (0, 1, 2), (0, 2, 1))
    # Least-squares operator inv(XT X) XT, applied per edge to get [slope, intercept].
    pinv = np.linalg.inv(XT @ X) @ XT
    mb = np.einsum('ijk,ik->ij', pinv, Y)

    # Outer edges are the data extremes; interior edges come from the fitted lines.
    smoothed_edges = np.r_[(np.min(data), mb[:, 0]*edges[1:-1] + mb[:, 1], np.max(data))]
    height = 1/nbins/(smoothed_edges[1:]-smoothed_edges[:-1])
    # Vertical lines at each edge -> repeat x; horizontal bin tops -> repeat heights.
    bin_edges = np.repeat(smoothed_edges, 2)
    bin_heights = np.r_[0., np.repeat(height, 2), 0.]

    return bin_edges, bin_heights

## Central Limit Theorem Demonstration

In [None]:
# Bounds of the uniform distribution and the number of draws averaged per sample mean.
lo = 4.5
hi = 5.5
n = 100

# Standard deviation of a uniform distribution on [lo, hi].
uniform_std = (hi - lo) / (12 ** 0.5)

# One million sample means, each the average of n uniform draws.
data = np.random.uniform(lo, hi, size=(int(1e6), n)).mean(axis=1)
x, y = equal_area_histogram(data)

# Theoretical density on a regular grid spanning the simulated range:
# differences of the normal CDF divided by the grid spacing.
nxdf = 100
data_lo = np.min(data)
data_hi = np.max(data)
xdf = np.linspace(data_lo, data_hi, nxdf)
cdf = stats.norm.cdf(xdf, loc=(lo + hi) / 2, scale=uniform_std / n ** 0.5)
heights = np.diff(cdf) / ((data_hi - data_lo) / (nxdf - 1))

# Overlay the simulated histogram outline and the theoretical curve
# (the latter plotted at grid-cell midpoints).
fig2 = make_subplots()
fig2.add_scatter(x=x, y=y, mode='lines', name='simulated data')
fig2.add_scatter(x=(xdf[1:] + xdf[:-1]) / 2, y=heights, mode='lines', name='theory')
fig2.update_layout(
    width=800,
    height=600,
    legend=dict(y=0.9, xanchor='right', x=0.95, font_size=20),
    title=dict(text='Normal Distribution', y=0.9, x=0.5, xanchor='center', yanchor='top'),
)

# Small Sample Size (t-Distribution)

In [None]:
# Plot window for the reference curves and the x-axis.  Named plot_min/plot_max
# (not min/max) so the Python builtins stay usable for the rest of the kernel session.
plot_min, plot_max = -5, 5
n = 3

# 1e5 samples of size n from a standard normal; for each sample form the
# t statistic: sample mean divided by its estimated standard error
# (the population mean is 0, sample std uses ddof=1).
raw_data = np.random.normal(loc=0, scale=1, size=(int(1e5), n))
raw_data_mean = np.mean(raw_data, axis=1)
raw_data_std = np.std(raw_data, axis=1, ddof=1)
data = raw_data_mean/(raw_data_std/n**0.5)
x, y = equal_area_histogram(data)

# Reference densities on a regular grid: CDF differences divided by grid spacing.
nxdf = 100
xdf = np.linspace(plot_min, plot_max, nxdf)
normal_cdf = stats.norm.cdf(xdf, loc=0, scale=1)
normal_heights = (normal_cdf[1:]-normal_cdf[:-1])/((plot_max-plot_min)/(nxdf-1))

t_cdf = stats.t.cdf(xdf, df=n-1, loc=0, scale=1)
t_heights = (t_cdf[1:]-t_cdf[:-1])/((plot_max-plot_min)/(nxdf-1))

# Overlay simulated data with the standard normal and the t distribution
# (n-1 degrees of freedom), both plotted at grid-cell midpoints.
fig3 = make_subplots()
fig3.add_scatter(x=x, y=y, mode='lines', name='simulated data')
fig3.add_scatter(x=(xdf[1:]+xdf[:-1])/2, y=normal_heights, mode='lines', name='normal')
fig3.add_scatter(x=(xdf[1:]+xdf[:-1])/2, y=t_heights, mode='lines', name='t-dist')
# Title names the distribution actually being demonstrated in this section.
fig3.update_layout(width=800, height=600, legend=dict(y=0.9, xanchor='right', x=0.95, font_size=20),
                   title=dict(text=f't-Distribution df={n-1}', y=0.9, x=0.5, xanchor='center', yanchor='top'))
fig3.update_xaxes(range=[plot_min, plot_max])

# Chi-Square Distribution: Sum of Squares of Normally Distributed Random Numbers

In [None]:
# Degrees of freedom = number of squared standard-normal draws summed per sample.
df = 10

# One million sums of df squared standard-normal values.
normal_draws = np.random.normal(loc=0., scale=1., size=(int(1e6), df))
data = np.sum(normal_draws ** 2, axis=1)
x, y = equal_area_histogram(data)

# Theoretical density on a regular grid from 0 to the largest simulated value:
# chi-square CDF differences divided by the grid spacing.
nxdf = 100
data_hi = np.max(data)
xdf = np.linspace(0, data_hi, nxdf)
cdf = stats.chi2.cdf(xdf, df)
heights = np.diff(cdf) / (data_hi / (nxdf - 1))

# Overlay the simulated outline and the theoretical curve (at grid-cell midpoints).
fig4 = make_subplots()
fig4.add_scatter(x=x, y=y, mode='lines', name='simulated data')
fig4.add_scatter(x=(xdf[1:] + xdf[:-1]) / 2, y=heights, mode='lines', name='theory')
fig4.update_layout(
    width=800,
    height=600,
    legend=dict(y=0.9, xanchor='right', x=0.95, font_size=20),
    title=dict(text=f'Chi Square Distribution df={df}', y=0.9, x=0.5, xanchor='center', yanchor='top'),
)

# F Distribution: Ratio of Sum of Squares of Normally Distributed Random Numbers

In [None]:
# Numerator / denominator degrees of freedom and the right-hand plot limit.
dfn = 10
dfd = 10
plot_max = 5

# Ratio of two independent mean-squares of standard-normal draws
# (numerator drawn first, then denominator), one million samples.
numerator_ms = np.sum(np.random.normal(loc=0., scale=1., size=(int(1e6), dfn)) ** 2, axis=1) / dfn
denominator_ms = np.sum(np.random.normal(loc=0., scale=1., size=(int(1e6), dfd)) ** 2, axis=1) / dfd
data = numerator_ms / denominator_ms
x, y = equal_area_histogram(data)

# Theoretical density on a regular grid over [0, plot_max]:
# F CDF differences divided by the grid spacing.
nxdf = 100
xdf = np.linspace(0, plot_max, nxdf)
cdf = stats.f.cdf(xdf, dfn, dfd)
heights = np.diff(cdf) / (plot_max / (nxdf - 1))

# Overlay the simulated outline and the theoretical curve (at grid-cell midpoints),
# restricting the x-axis to the plotted window.
fig5 = make_subplots()
fig5.add_scatter(x=x, y=y, mode='lines', name='simulated data')
fig5.add_scatter(x=(xdf[1:] + xdf[:-1]) / 2, y=heights, mode='lines', name='theory')
fig5.update_layout(
    width=800,
    height=600,
    showlegend=True,
    legend=dict(y=0.9, xanchor='right', x=0.95, font_size=20),
    title=dict(text=f'F-Distribution dfn={dfn}, dfd={dfd}', y=0.9, x=0.5, xanchor='center', yanchor='top'),
)
fig5.update_xaxes(range=[0, plot_max])