In [1]:
import pandas as pd
import numpy as np
from pylab import polyfit  # uses better numerical method rather than brute force normal equation computation

import panel as pn
pn.extension()

import holoviews as hv
hv.extension('bokeh', logo=None)

hv.Store.add_style_opts(hv.ErrorBars, ['lower_head', 'upper_head'], 'bokeh')

<div style="float:center;width:100%;text-align: center;"><strong style="height:100px;color:darkred;font-size:40px;">Regression: an Application of the Normal Equations</strong></div>

Statistical plots using holoviews that may be of interest:<br>
https://github.com/ea42gh/HoloviewsPlayground/blob/master/StatisticalPlots.ipynb

# 1. Create some data

In [2]:
np.random.seed( 123321 ) # this just fixes the values returned by the random number generator
                         # (use whatever integer for the 'seed', or remove this call)
def get_data(N=7):
    ''' Create a data set of 20 measurements'''
    x       = np.linspace(0,N-1,N)
    #y_exact = 2*x + 1.              ##########################  <- function y = 1+ 2 x
    y_exact = 0.1*x*x +2*x + 1.      ##########################  <- function y = 1+ 2 x + 0.1 x^2
    y       = y_exact + 8*np.random.normal(size=N)

    df= pd.DataFrame( {'x':       x,
                       'y':       y,
                       'y_exact': y_exact,
                      })
    return df
df = get_data()
df.tail(5)

Unnamed: 0,x,y,y_exact
2,2.0,15.542752,5.4
3,3.0,-3.154799,7.9
4,4.0,6.212741,10.6
5,5.0,16.22355,13.5
6,6.0,27.564125,16.6


In [3]:
# Plot routine: fitted polynomial
def plot_poly( params, x_i, y_i, e=True ):
    '''given model parameters params and x values x_i, plot the fitted polynomial'''

    # compute dense x,y values of the poly for display of a smooth curve
    x = np.linspace( min(x_i), max(x_i), 400)
    y = params[0]+params[1]*x
    for i in range(2,len(params)):
        y += params[i]*x**i

    # compute values of the poly for df['x']
    y_hat = params[0] + params[1]*x_i
    for i in range(2,len(params)):
        y_hat += params[i]*x_i**i

    # -----------------------------------------------------------------------
    h1 = hv.Scatter((x_i, y_hat), label='Estimate of y' ).opts(color='blue', size=4)
    h2 = hv.Curve((x,y), label='fitting polynomial').opts(color='blue', line_width=0.8)
    if e:
        h3 = hv.ErrorBars(np.stack([x_i,y_hat,0*y_hat,y_i-y_hat], axis=1),
                        kdims='x',vdims=['y', ' ', 'error'],    group='Measured', label='errors')\
               .opts(lower_head = None, upper_head = None)
        h = h1*h2*h3
    else:
        h = h1*h2

    return h

## 1.1 Pandas DataFrame

This table is a pandas 'DataFrame': it organizes data into columns.

One can do computations with columns, e.g., find the random value we added at each point x:
* df['y'] - df['y_exact']
Once can add columns:
* df['ones'] = np.ones(20)

**Remark:** Multiplication of a column just multiplies the entries in each of the rows (no dot products...)

In [4]:
df['x0'] = np.ones(len(df))
df['x1']   = df['x'] # uniform naming convention for the powers of x allows simplifying the logic below
for i in range(2, 10):
    df[f'x{i}']   = df[f'x{i-1}']*df['x']
df.head(3)

Unnamed: 0,x,y,y_exact,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,0.0,8.241605,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,-1.873353,3.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2.0,15.542752,5.4,1.0,2.0,4.0,8.0,16.0,32.0,64.0,128.0,256.0,512.0


We can look at a subset of the columns, and better yet, make them into an array:
* df[['x0','x1']]

In [5]:
A = df[[ 'x0','x1' ]]
A.head(4)

Unnamed: 0,x0,x1
0,1.0,0.0
1,1.0,1.0
2,1.0,2.0
3,1.0,3.0


In [6]:
A = df[['x0','x1']].to_numpy()
A[0:3,:]  # first three rows

array([[1., 0.],
       [1., 1.],
       [1., 2.]])

## 1.2 Take a look at the data

In [7]:
h = hv.Scatter(df, 'x', 'y', label='measured' ).opts(size=6,color='red')*hv.Curve(df, 'x', 'y_exact', label='exact').opts(color='red', line_width=0.5)
h = h.opts(legend_position='right', width=600, title="Measurement and Exact Curve y versus x")
h

# 2. Let's fit some model

## 2.1 Let's try a line:  y = a + b x

Look at our table: each row should satisfy this equation<br>
(it does not, the data do not fall on a line)

In [8]:
df[['x','y']].head(3)

Unnamed: 0,x,y
0,0.0,8.241605
1,1.0,-1.873353
2,2.0,15.542752


I.e.,
$$\left. \begin{align}
a + 0 b &\approx \;\; 8.2 \\
a + 1 b &\approx -1.9 \\
a + 2 b &\approx 15.5 \\
\dots &
\end{align} \ \right\} \; \Leftrightarrow
A x = y  + e \quad \Leftrightarrow
A = \begin{pmatrix} 1 & x_0 \\ 1 & x_1 \\ 1 & x_2 \\ \dots & \dots \end{pmatrix},\;\;x = \begin{pmatrix} a \\ b \end{pmatrix}, $$
and  $e$ is the error (the deviation from the exact curve)

**The idea:** let's choose the coefficients to make $\sum_i{e_i^2}$ as small as possible: "Min Least Squares" 

Looking at our equations from a linear algebra perspective, we see a picture in the codomain:<br>$\quad$ $A x $ is some vector in $\mathscr{C}(A)$,
and $y$ is some vector pointing out of that column space!

We want $\lVert e \rVert = \lVert y - A x \rVert$ to be a minimum: $x$ is the solution of the normal equation!

### 2.1.1 Solve the normal equation for $x$

In [9]:
A = df[['x0','x1']].to_numpy()
b = df['y'].to_numpy()

AtA = A.T @ A; Atb = A.T @ b
x = np.linalg.solve( AtA, Atb)
print("a_hat,b_hat = ", x)

a_hat,b_hat =  [0.73330088 3.02969118]


These are our estimates $\hat{a}, \; \hat{b}$ for the values of $a$ and $b$

Our estimate of the true value of $y$ is
$\boxed{\hat{y} = \hat{a} + \hat{b} x}$

In [10]:
df['yhat'] = x[0] + x[1]*df['x']
df.head(3)

Unnamed: 0,x,y,y_exact,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,yhat
0,0.0,8.241605,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.733301
1,1.0,-1.873353,3.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.762992
2,2.0,15.542752,5.4,1.0,2.0,4.0,8.0,16.0,32.0,64.0,128.0,256.0,512.0,6.792683


In [11]:
# Let's add out estimate to the previous plot
h_lin = h * plot_poly(x,df['x'], df['y'])
#h_lin = h * hv.Curve(df, 'x', 'yhat', label='Estimate of y' ).opts(color='blue', line_width=0.8)
h_lin.opts(width=700,legend_position='right',title='Linear Model')

How big is $\lVert e \rVert$?

In [12]:
e = df['yhat']-df['y']
print( 'norm(e) =', np.linalg.norm(e))

norm(e) = 21.26452478298913


## 2.2 Model y =a + b x + c x^2 + d sin(.1x)

We can try other models, e.g., add a sine term: the analysis stays the same, it's just our matrix that grew bigger!<br>
The requirement is that the **dependence of the model on the parameters be linear** ($a,b,c,d$ appear as multipliers of functions of $x$)

In [13]:
df['sin'] = 0.1*df['x']
A=df[['x0','x1','x2','sin']].to_numpy()
A[0:4,:]

array([[1. , 0. , 0. , 0. ],
       [1. , 1. , 1. , 0.1],
       [1. , 2. , 4. , 0.2],
       [1. , 3. , 9. , 0.3]])

In [14]:
AtA = A.T @ A; Atb = A.T @ b
x = np.linalg.solve( AtA, Atb)
print(x)

[  8.25600127 -11.8751135    1.50454008  58.77564196]


In [15]:
df['yhat3'] = x[0] + x[1]*df['x1'] + x[2]*df['x2'] + x[3]*df['sin']
df.head(3)

Unnamed: 0,x,y,y_exact,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,yhat,sin,yhat3
0,0.0,8.241605,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.733301,0.0,8.256001
1,1.0,-1.873353,3.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.762992,0.1,3.762992
2,2.0,15.542752,5.4,1.0,2.0,4.0,8.0,16.0,32.0,64.0,128.0,256.0,512.0,6.792683,0.2,2.279063


In [16]:
h_cub = h * hv.Curve(df, 'x', 'yhat3', label='Estimate of y' ).opts(color='blue', line_width=0.8)
h_cub.opts(width=700,legend_position='right',title='Model y = a + b x + c x^2 + d sin(0.1 x)')

In [17]:
e3 = df['yhat3']-df['y']
print( 'norm(e3) =', np.linalg.norm(e3))

norm(e3) = 16.187469929662782


## 2.3 Model y =a + b x + c x^2 + d x^3 +e x^4 + f x^5

In [18]:
A=df[[f'x{i}' for i in range(6)]].to_numpy()
A[0:4,:]

array([[  1.,   0.,   0.,   0.,   0.,   0.],
       [  1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   2.,   4.,   8.,  16.,  32.],
       [  1.,   3.,   9.,  27.,  81., 243.]])

In [19]:
AtA = A.T @ A; Atb = A.T @ b
x = np.linalg.solve( AtA, Atb)
with np.printoptions(precision=2):
    print("parameters:", x)

parameters: [  7.87 -49.77  69.07 -32.7    6.27  -0.42]


In [20]:
df['yhat5'] = x[0]*df['x0'] + x[1]*df['x1'] + x[2]*df['x2'] + x[3]*df['x3'] + x[4]*df['x4'] + x[5]*df['x5']
df.head(3)

# We could rewrite this using a matrix multiplication
# df['yhat5'] =  df[[f'x{i}' for i in range(6)]] @ x

Unnamed: 0,x,y,y_exact,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,yhat,sin,yhat3,yhat5
0,0.0,8.241605,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.733301,0.0,8.256001,7.874578
1,1.0,-1.873353,3.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.762992,0.1,3.762992,0.328809
2,2.0,15.542752,5.4,1.0,2.0,4.0,8.0,16.0,32.0,64.0,128.0,256.0,512.0,6.792683,0.2,2.279063,10.037347


In [21]:
h_quint = h * plot_poly(x,df['x'], df['y'], e=True) #hv.Curve(df, 'x', 'yhat5', label='Estimate of y' ).opts(color='blue', line_width=0.8)
h_quint.opts(width=700,legend_position='right',title='Quintic Model')

In [22]:
e5 = df['yhat5']-df['y']
print( 'norm(e5) =', np.linalg.norm(e5))

norm(e5) = 11.156654243852202


# Some Comments

As we increase the degree of the polynomial, we obtain curves that pass closer and closer to the measured values:<br>
When we use a polynomial model that has the same number of parameters as the number of measurement points,<br> 
the polynomial passes through every one of these points
* we have modeled the data exactly: the error is 0, contray to our initial assumption! We have **overfitted the data**
* the polynomial will be forced to oscillate.
* the **oscillations can be mitigated** if we are free to choose the $x$ values:<br>
rather than choosing a uniform partition of the $x$ interval, an optimal choice are Chebyshev points: a cosine sampled at a uniform partition of the angle

One other point to stress: the **fit is not valid outside the range of the x values!** E.g., look at the intercept of the estimates with the x axis for the linear model we started with!

# Better Sample Points: Chebyshev Nodes

In [23]:
def c_nodes( x1, xn, N):
    '''sample values x'''
    return 0.5*(x1+xn) + 0.5*(xn-x1)*np.array([np.cos(np.pi*(2*k-1)/(2.*N)) for k in range(1,N+1)])

N=30
x_sample = c_nodes(-1,1,N)
hv.Spikes(x_sample).opts(width=500,height=120,yticks=1,title='Good sample values for x')

The data frame implementation above was convenient, but gets harder to handle if we want to experiment with various models.<br>
It will be simpler to construct the $A$ matrix directly.

For polynomial models of degree $d$, the matrix is $\mathbf{A = (1\; x\; x^2\;\dots x^d )}$.<br> This is known as a **Vandermonde matrix.**

* The estimated error is $e = \hat{y} - y$, the difference of the y value on the fitted polynomial and the measured y value<br>
I computed these values in the solve routine: they are
$\hat{y} = A x, \hat{e} = y - \hat{y}$
* I also provided more sophisticated plots and used panel to lay them out on a grid

In [24]:
class NormalEquationFit:
    def __init__( self, x_sample, y_sample, true_y = None, typ="Uniform Sampling" ):
        self.typ        = typ
        self.x_sample   = x_sample
        self.y_sample   = y_sample
        self.true_y     = true_y
        self.true_error = None if true_y is None else y_sample - true_y
        self.degree     = None

        self.A          = np.vander( x_sample, increasing=True)
        self.x_rng      = (min(x_sample), max(x_sample))
        self.x_fine     = np.linspace(*self.x_rng, 400 ) # used to plot the polynomial at sufficiently fine resolution
        
        self._plot_init()

        #.opts(title=f'Uniform Sampling, Polynomial of degree {d}',legend_position='top', width=600)


    def solve(self, d):
        self.degree = d
        d          += 1
        Ad          = self.A[:,0:d]
        AdtAd       = Ad.T @ Ad; Adtb = Ad.T @ self.y_sample

        try:
            params = np.linalg.solve( AdtAd, Adtb )
        except:
            params      = np.zeros(shape=d)   # the solver failed. Let's just zero out everything...
            self.degree = 0
        self.poly           = np.poly1d(params[::-1])

        self.y_fine         = self.poly( self.x_fine )
        self.y_estimate     = self.poly( self.x_sample )
        self.error_estimate = self.y_sample - self.y_estimate

    def _plot_init(self):
        self.h_samples = hv.Scatter((self.x_sample, self.y_sample), label='measured y').opts( size=4, color='red', muted_alpha=0.01)
        if self.true_y is not None:
            self.h_true  = hv.Curve((self.x_sample, self.true_y), label='truth').opts(color='red', muted_alpha=0.01, line_width=.6)

    def _fit_plot(self):
        h_estimate     = hv.Scatter((self.x_sample,self.y_estimate), label='estimate').opts(color='blue', size=4, muted_alpha=0.01) *\
                         hv.Curve( (self.x_fine, self.y_fine), label='estimate').opts(color='blue', line_width=.6, muted_alpha=0.01)
        h_error_bars   = hv.ErrorBars( (self.x_sample,self.y_estimate,np.zeros(len(self.x_sample)), self.error_estimate), vdims=['y','0','e'])\
                           .opts( lower_head = None, upper_head = None)
        h = self.h_samples * h_estimate * h_error_bars
        if self.true_y is not None:
            h = h * self.h_true
        return h.opts( title=f'{self.typ}, Polynomial of degree {self.degree}', width=600, height=300, show_grid=True, legend_position='left')
    
    def _error_plot(self):
        def kde_scipy(x, x_grid, bandwidth=0.2, **kwargs):
            """Kernel Density Estimation with Scipy"""
            # Note that scipy weights its bandwidth by the covariance of the
            # input data.  To make the results comparable to the other methods,
            # we divide the bandwidth by the sample standard deviation here.

            from scipy.stats import gaussian_kde

            kde = gaussian_kde(x, bw_method=bandwidth / x.std(ddof=1), **kwargs)
            return kde.evaluate(x_grid)

        s      = self.error_estimate
        s_grid = np.linspace(-4.5, 4.5, 100)
        h = \
        hv.Histogram( np.histogram( s, 10, density=True), kdims='error' ).opts(fill_color='slateblue', alpha=0.4 ) * \
        hv.Curve((s_grid, kde_scipy( s, s_grid ))).opts( color = 'red')
        return h.opts(title='Estimated Error Distribution')

    def plot( self, d ):
        self.solve(d)
        return pn.Row(self._fit_plot(), self._error_plot())

In [26]:
def gen_pb(N, typ=0, err = 0.8, sz=10. ):
    if typ == 0:
        typ      = "Uniform Sampling"
        x_sample = np.linspace(-2,2,N)
    else:
        typ      = "Chebyshev Sampling"
        x_sample = c_nodes    (-2,2,N)

    y_exact   = np.array([ sz/(1+x*x) for x in x_sample ])
    y_sample  = y_exact + err*np.random.normal(size=len(x_sample))
    fit       = NormalEquationFit( x_sample, y_sample, y_exact, typ )
    return  fit

# ==============================================================================
N = 50; err = 2.8
uniform_fit    = gen_pb( 50, 0, err )
chebyshev_fit  = gen_pb( 50, 1, err )

pn.interact( lambda d: pn.Column(uniform_fit.plot(d),chebyshev_fit.plot(d)),
             d=pn.widgets.IntSlider(start=1,end=N-1,value=N//2, name="Degree of the polynomial") )

# And so...

Care to **play with this?** Change the data, the models, the number of samples, the distributions of the errors<br>
How are the error estimates distributed? Etc...

**Programming Exercise:** add a slider for the bandwidth used in the kernel density computation:
* you will need to make bandwidth a parameter of the error plot function
* you will then have to add the parameter to the plot function, and pass the value in
* finally, you will need to change the interact call:<br>
the plotting function has 2 parameters (degree and bandwidth),<br>
and you want to define a panel slider to set the bandwidth as well