<img src="../../shared/img/slides_banner.svg" width=2560></img>

# Regression 02

In [None]:
import sys

sys.path.append("../../")

from shared.src import quiet
from shared.src import seed
from shared.src import style

In [None]:
from pathlib import Path
import random

from IPython.display import HTML, Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
import scipy.stats

In [None]:
%matplotlib notebook

sns.set_context("notebook", font_scale=1.7)

import shared.src.utils.util as shared_util

In [None]:
def datalikelihood_scatter(ys, xs, intercept, slope, vmin=-100, vmax=0):
    """Scatter the data, colored by each point's Normal log-likelihood.

    The per-point log-likelihoods come from ``compute_normal_ll`` with
    ``sigma=1``. The prediction line for ``slope`` and ``intercept`` is drawn
    in black over x-values from 60 to 75, and the rounded total
    log-likelihood is written in the axes title.

    Parameters
    ----------
    ys, xs : observed dependent and independent values.
    intercept, slope : parameters of the prediction line.
    vmin, vmax : limits of the color scale applied to the log-likelihoods.
    """
    point_lls = compute_normal_ll(ys, xs, slope, intercept, sigma=1)

    _, ax = plt.subplots(figsize=(10, 8))
    points = ax.scatter(xs, ys, c=point_lls, vmin=vmin, vmax=vmax)

    line_xs = np.linspace(60, 75)
    ax.plot(line_xs, predict_height(line_xs, slope, intercept), lw=4, color="k")

    ax.set_xlabel("midparental_height")
    ax.set_ylabel("height")

    colorbar = plt.colorbar(points)
    colorbar.ax.set_title("Log-Likelihood\n")

    ax.set_title(f"Total LL = {int(np.round(sum(point_lls)))}")
    

def perturb_parameters(slope, intercept):
    """Return ``(slope, intercept)`` with Gaussian noise added to each.

    The slope noise is a standard Normal draw scaled by 0.01 and the
    intercept noise one scaled by 0.5. Both draws come from NumPy's global
    random state, the slope's first.
    """
    slope_jitter = np.random.standard_normal() * 0.01
    intercept_jitter = np.random.standard_normal() * 0.5
    return slope + slope_jitter, intercept + intercept_jitter


def predict_height(midparental_heights, slope, intercept):
    """Evaluate the line ``slope * midparental_heights + intercept``."""
    scaled_heights = slope * midparental_heights
    return scaled_heights + intercept


import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d  # noqa: F401


def datalikelihood_surface(compute_ll, xrange=(60, 75), yrange=(55, 80), num=20):
    """Draw ``compute_ll`` as a 3-D surface with the observed data on top.

    Parameters
    ----------
    compute_ll : callable invoked as ``compute_ll(ys, xs)`` on the mesh grids.
    xrange, yrange : (start, stop) pairs for the x and y grids.
    num : number of grid points along each axis.

    Returns
    -------
    The 3-D axes the surface was drawn on.
    """
    grid_x, grid_y = np.meshgrid(
        np.linspace(*xrange, num=num),
        np.linspace(*yrange, num=num),
    )
    # compute_ll takes the y-values first, then the x-values.
    surface = compute_ll(grid_y, grid_x)

    plt.figure(figsize=(8, 8))
    ax = plt.axes(projection='3d')
    ax.plot_surface(grid_x, grid_y, surface, cmap="hot")

    # The observations are read from the module-level `df` and drawn at a
    # fixed z of 20.
    ax.scatter(df["midparental_height"], df["height"], zs=20, c="b")

    ax.set_xlabel("midparental_height")
    ax.set_ylabel("height")
    ax.set_zlabel("Log-Likelihood")
    # Optional z-axis clamp, left disabled: ax.set_zlim([-100, 0])

    return ax


def annotate_normal_likelihood(ax):
    """Label a plot of the Normal density and mark where the prediction sits.

    Sets the axis labels, adds a "Prediction" arrow pointing at (0, 0.4),
    fixes the y-limits to [0, 1], and removes the x ticks.
    """
    ax.set_xlabel("Observed Value")
    ax.set_ylabel("Likelihood\nUnder Normal")
    # "\\cdot" keeps the LaTeX command intact: a bare "\c" is not a valid
    # Python escape sequence, while "\n" here must stay a real newline.
    ax.annotate("Prediction\n$=m\\cdot x + b$", xy=(0, 0.4), xytext=(0.5, 0.5),
                arrowprops=dict(facecolor='black'))
    ax.set_ylim(0, 1)
    ax.set_xticks([])

    
def annotate_normal_log(ax):
    """Label a plot of the Normal log-density and mark the prediction.

    Sets the axis labels, adds a "Prediction" arrow pointing at (0, -0.75),
    fixes the y-limits to [-6, 2], removes the x ticks, and tightens the
    figure layout.
    """
    ax.set_xlabel("Observed Value")
    ax.set_ylabel("Log-Probability\nUnder Normal")
    # "\\cdot" keeps the LaTeX command intact: a bare "\c" is not a valid
    # Python escape sequence, while "\n" here must stay a real newline.
    ax.annotate("Prediction\n$=m\\cdot x + b$", xy=(0, -0.75), xytext=(0.5, 0),
                arrowprops=dict(facecolor='black'))
    ax.set_ylim(-6, 2)
    ax.set_xticks([])
    plt.tight_layout()

    
def annotate_normal_ll(ax):
    """Label a plot of the Normal log-likelihood and mark the observed value.

    The arrow position is taken from the module-level ``observed_value``,
    which must be defined before this is called. Also fixes the y-limits to
    [-6, 2] and removes the x ticks.
    """
    # Raw string so the LaTeX "\cdot" is not read as a Python escape sequence.
    ax.set_xlabel(r"Prediction$=m\cdot x + b$")
    ax.set_ylabel("Log-Likelihood\nUnder Normal")
    ax.annotate("Observed Value", xy=(observed_value, -0.75),
                xytext=(observed_value + 0.5, 0),
                arrowprops=dict(facecolor='black'))
    ax.set_ylim(-6, 2)
    ax.set_xticks([])

def make_yerr(predictions, observations):
    """Split signed prediction errors into a two-row, non-negative array.

    Row 0 holds ``predictions - observations`` where that difference is
    positive and 0 elsewhere. Row 1 holds its magnitude where it is negative
    and 0 elsewhere. ``make_error_plot`` passes the result to ``errorbar``
    as ``yerr``.
    """
    residuals = predictions - observations
    overshoot = np.where(residuals > 0, residuals, 0)
    undershoot = np.where(residuals < 0, -residuals, 0)
    return np.stack([overshoot, undershoot])

def make_error_plot(slope, intercept, observation_df):
    """Plot observations, the prediction line, and each point's error bar.

    Parameters
    ----------
    slope, intercept : parameters of the prediction line.
    observation_df : table with "midparental_height" and "height" columns.

    The mean squared error over ``observation_df`` is written in the title.
    """
    _, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(observation_df["midparental_height"], observation_df["height"],
               s=144, label="observations")

    xs = np.linspace(62, 72)
    ax.plot(xs, predict_height(xs, slope, intercept),
            lw=4, color="k", label="prediction function")

    predictions = predict_height(observation_df["midparental_height"], slope, intercept)
    # Errors are measured against the rows that were passed in, not against
    # a module-level sample, so the function works for any observation_df.
    yerr = make_yerr(predictions, observation_df["height"])
    ax.errorbar(observation_df["midparental_height"], predictions,
                yerr=yerr, ecolor="r", elinewidth=4, ls="none", zorder=0, label="errors")
    ax.set_ylim(60, 85)
    ax.legend()

    # Each point has a nonzero entry in at most one row of yerr, so summing
    # the squares over both rows gives the sum of squared errors.
    MSE = round(np.sum(np.square(yerr)) / len(observation_df), 1)
    ax.set_title(f"MSE = {MSE}")

# Regression models relate one continuous variable to the parameters of another.

For an independent variable $x$ and dependent variable $y$, a regression model takes the form

$$
y \sim \text{Foo}\left(\beta= f(x)\right)
$$

where $\text{Foo}$ is some distribution with parameter $\beta$.

We then put a prior over the function $f$ that relates the two variables
and then obtain a posterior that tells us how the two variables are likely to be related.

In an ideal world, we'd put a prior over all possible functions $f$:
say, over all Python programs that take in a number and spit out a number.

But that's impractical, for any of a number of reasons,
and so we work with smaller families of functions:
all possible lines, all possible parabolas, etc.

We then put priors over the parameters that describe that family of functions
(e.g. the intercept and slope in the case of linear regression)
instead of directly on the function.

Choosing "all possible lines" as the family of functions is particularly common and useful.

When we choose that family, we are doing _linear regression_.

# Linear regression is the most important regression model.

In this lecture,
we're taking a deep dive into linear regression:

$$y \sim \text{Foo}(\text{slope}\cdot x + \text{intercept}, \sigma)$$

## Among linear regression models, the case where the likelihood is Normal is particularly important.

So we'll in particular (mostly) focus on the case where the likelihood is Normal:

$$y \sim \text{Normal}(\text{slope}\cdot x + \text{intercept}, \sigma)$$

As with the previous lecture, today we'll work with a famous dataset:
Sir Francis Galton's parent-child height dataset ([source](https://doi.org/10.7910/DVN/T0HSJ1)),
for which the technique of regression was invented and named
([original paper](http://www.stat.ucla.edu/~nchristo/statistics100C/history_regression.pdf)).

In [None]:
# Load the Galton data; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv("./data/galton_height.csv", index_col=0)

It contains the heights of nearly 1000 English individuals, their sex, and the heights of both their parents, collected in 1885.

Following Galton, we summarize the parental heights by averaging them to obtain a "`midparental_height`".

In [None]:
print(df.head())

In [None]:
sns.jointplot(x="midparental_height", y="height",  data=df, kind="hex");

# As always in modeling, our goal is a posterior.

The posterior represents our updated belief about how the two variables relate to one another,
once we've observed our dataset.

Again as always, the posterior has two pieces:

$$
\color{green}{p(\text{slope}, \text{intercept}, \sigma \vert \text{data})}
\propto \color{darkgoldenrod}{p(\text{data} \vert \text{slope}, \text{intercept}, \sigma)}
\cdot \color{darkblue}{p(\text{slope}, \text{intercept}, \sigma)}
$$

That is, our

$\color{green}{\text{updated belief about the plausibility of a given relationship between x and y}}$

is proportional to

$\color{darkgoldenrod}{\text{how likely the data is under that relationship}}$

multiplied by

$\color{darkblue}{\text{how plausible we thought that relationship was before we saw the data}}$.

# Let's first take a look at the likelihood.

The Normal distribution is, famously, shaped like a bell. 

In [None]:
# Plot the standard Normal density on a grid from -3 to 3, then label it.
f, ax = plt.subplots(figsize=(8, 4))
xs = np.linspace(-3, 3)
ax.plot(xs, scipy.stats.norm.pdf(xs, loc=0, scale=1), lw=6)
annotate_normal_likelihood(ax)
plt.tight_layout()

But the logarithm of the Normal distribution has an even simpler shape.

## Logarithms of probabilities are often more natural than probabilities.

Fundamentally, it is because probabilities interact through _multiplication_:
just look at Bayes' Rule.

But we naturally think in terms of _addition_.
For example, space and time are additive -- distances and epochs
are measured in terms of differences, not ratios.


Logarithms _turn multiplication into addition_,
so they make some things in probability more natural.

Negative log probabilities are also called _surprises_.
(For more, see
[this blog post](https://charlesfrye.github.io/stats/2017/11/09/the-surprise-game.html)
on expected surprise as a modeling criterion
or, for a more intuitive approach,
[this blog post](https://charlesfrye.github.io/stats/2016/03/29/info-theory-surprise-entropy.html)
on where surprise comes from and how it relates to information.)

In [None]:
def normal_log(x, mu, sigma=1):
    """Return the log-density of ``Normal(mu, sigma)`` evaluated at ``x``.

    Parameters
    ----------
    x : value(s) at which to evaluate the log-density.
    mu : mean of the distribution.
    sigma : standard deviation of the distribution (default 1).
    """
    # sigma is passed through rather than hard-coding a scale of 1. logpdf
    # computes the log directly instead of taking the log of a density that
    # may have underflowed to 0 far from the mean.
    return scipy.stats.norm(mu, sigma).logpdf(x)

## For example, the logarithm of a Normal distribution is a parabola.

The peak of the parabola is at the mean.

In our particular case, the mean is the value we predict.

In [None]:
f, ax = plt.subplots(figsize=(10, 5))

# Log-density with the mean held at 0, evaluated across a grid of observed values.
xs = np.linspace(-3, 3)
normal_log_as_function_of_x = normal_log(xs, 0)
ax.plot(xs, normal_log_as_function_of_x, lw=6);
annotate_normal_log(ax)

Notice two things:
first, the value most probably observed is equal to the prediction,
and second, lots of other values are also quite probable --
more so the closer they are to the prediction.

The "steepness" of the parabola and its height also depend on the standard deviation,
as we can see if we use the logarithm rules on the logarithm of the Normal distribution function.

$$
\log p (x\vert\mu,\sigma) = \underbrace{-\log\left(\sqrt{2\pi}\sigma\right)}_\text{uncertainty penalty} -
\underbrace{(x - \mu)^2 / 2\sigma^2}_\text{scaled squared error}\\
$$

Mathematical details below:
$$
\begin{align}
p(x\vert\mu,\sigma) &= \frac{1}{\sqrt{2\pi}\sigma} \mathrm{e}^{\frac{-(x-\mu)^2}{2\sigma^2}}\\
\log p (x\vert\mu,\sigma) &= \log \left(\frac{1}{\sqrt{2\pi}\sigma} \mathrm{e}^{\frac{-(x-\mu)^2}{2\sigma^2}}\right)\\
&= \log\left(\frac{1}{\sqrt{2\pi}\sigma}\right) + \log\left( \mathrm{e}^{\frac{-(x-\mu)^2}{2\sigma^2}}\right)\\
&= -\log\left(\sqrt{2\pi}\sigma\right) -(x - \mu)^2/2\sigma^2\\
&= \underbrace{-\log\left(\sqrt{2\pi}\sigma\right)}_\text{lower if spread increases}
\underbrace{-(x - \mu)^2}_{\text{higher if mean and value are close}}/ \underbrace{2\sigma^2}_\text{spread controls scale of errors}\\
&= \underbrace{-\log\left(\sqrt{2\pi}\sigma\right)}_\text{uncertainty penalty} -
\underbrace{(x - \mu)^2 / 2\sigma^2}_\text{scaled squared error}\\
\end{align}
$$

But in general, we can ignore the standard deviation,
since it only tells us how far to shift the parabola down and how steep to make it --
it doesn't change where the peak is.

As a function of the mean,
that is, as a _likelihood_,
the shape is also a parabola:

In [None]:
# Hold one observation fixed and sweep the candidate prediction (the mean),
# recording the log-probability of the observation under each candidate.
predictions = np.linspace(-3, 3)
observed_value = 0.5
normal_log_as_function_of_prediction = [
    normal_log(observed_value, mu) for mu in predictions
]

In [None]:
f, ax = plt.subplots(figsize=(10, 5))

# x-axis: candidate predictions; y-axis: log-probability of the one observed value.
ax.plot(predictions, normal_log_as_function_of_prediction, lw=6);
annotate_normal_ll(ax); plt.tight_layout()

Notice that the peak of the log-likelihood is located at the observed value.

## Therefore the Normal likelihood term in our regression model is telling us to minimize the squared error.

In a regression model, the mean is the prediction and the "height" of the parabola is the negative squared prediction error.

NB: maximizing the negative squared error is the same as minimizing the squared error.

In [None]:
def negative_squared_error(observed_value, predicted_value):
    """Return the negated square of ``observed_value - predicted_value``."""
    error = observed_value - predicted_value
    return -(error ** 2)

If we compute the negative squared error,
we get the same rough shape,
a parabola,
up to a y-axis shift and a change in the steepness.

And importantly, the maximum is still at the same spot: right at the observed value.

In [None]:
# Plot the negative squared error as a function of the error itself.
f, ax = plt.subplots(figsize=(10, 5))
errors = np.linspace(-3, 3)
ax.plot(errors, -(errors ** 2), lw=4)
# Raw strings keep the LaTeX backslashes literal; "\m" is not a valid
# Python escape sequence.
ax.set_xlabel(r"$y-\mu$")
ax.set_ylabel(r"$-(y - \mu)^2$")
plt.tight_layout();

## In a real model, we have more than one observed value.

To determine the overall log-likelihood of a given choice of the parameters,
we have to look at their log-likelihood on each data point.

The overall log-likelihood is the sum of all of the individual log-likelihoods.

In [None]:
def compute_normal_ll(y, x, slope, intercept, sigma=1):
    """Return the Normal log-likelihood of each ``y`` given a linear prediction.

    The mean for each point is ``slope * x + intercept`` and the standard
    deviation is ``sigma``. The result is the uncertainty penalty
    ``-log(sqrt(2 * pi) * sigma)`` minus the scaled squared error
    ``(y - mean) ** 2 / (2 * sigma ** 2)``, computed elementwise.
    """
    residual = y - (slope * x + intercept)

    # Dividing by sqrt(2) * sigma before squaring gives residual^2 / (2 sigma^2).
    scaled_squared_error = (residual / (np.sqrt(2) * sigma)) ** 2

    log_normalizer = np.log(np.sqrt(2 * np.pi) * sigma)

    return -log_normalizer - scaled_squared_error

First, let's look at the log-likelihood of a baseline model:
one that predicts that the individual's height is average, no matter their parents' heights.

In [None]:
mean_height = df["height"].mean()


def compute_ll_baseline(y, x):
    """Log-likelihood under the baseline line: slope 0, intercept ``mean_height``."""
    return compute_normal_ll(y, x, 0, mean_height)

In [None]:
ax = datalikelihood_surface(compute_ll_baseline)

This plot is very dense and rewards close study and interaction
-- click and drag to change the camera view to see the various details.

My favorite view puts the x- and y- axes in approximately the same spot as they are in
the `jointplot`, but at a slight angle so that we can see the 3-D shape of the surface.

The x- and y-axes of this plot are the midparental height (the value we are using to predict)
and the child's height (the value we are trying to predict).

The z-axis is the Normal log-probability,
for the given choice of parameters,
for that combination of midparental and child heights.
To aid in reading the plot,
the log-probability is colored by its height on the z-axis.
Darker colors mean lower log-probability.

The observed data is plotted as well,
scattered "on top" of the log-probability.

Our analysis above suggests the following:
- the maximum value of this log-probability along the y-axis for a fixed x should be where the model predictions are.
- along the y-axis, the shape of the surface is a parabola

To confirm that you understand what is being visualized, make sure you can see those facets of the plot.

To calculate the total log-likelihood of the parameters,
we add up the log-probability of all of the data points:

In [None]:
sum(compute_ll_baseline(df["height"], df["midparental_height"]))

Let's compare that log-probability and total log-likelihood to those
of a better model.

These parameters are based on the posterior samples in the previous lecture.

Below, we'll see how to obtain good parameter values.

In [None]:
def compute_ll_good(y, x):
    """Log-likelihood under the line with slope 0.7 and intercept 20 (sigma 1)."""
    return compute_normal_ll(y, x, 0.7, 20, sigma=1)

In [None]:
ax = datalikelihood_surface(compute_ll_good)

Notice that for these parameters,
the brightly-colored region of the log-probability,
where the  values are high,
now overlaps with most of the dataset.

When we add up the total log-likelihood for this model, we get a much less negative value:

In [None]:
sum(compute_ll_good(df["height"], df["midparental_height"]))

The log-likelihood is lower for a worse model.

In [None]:
def compute_ll_bad(y, x):
    """Log-likelihood under the line with slope -0.3 and intercept 80."""
    return compute_normal_ll(y, x, -0.3, 80)

This model is predicting that the child's height should decrease when the parents' height increases,
as evidenced by the negative slope.

In [None]:
ax = datalikelihood_surface(compute_ll_bad)

For this setting of the parameters,
the high values of the log-probability do not overlap with the data as much,
and a few of the datapoints are located in very low-probability regions,
e.g. where the log-probability is close to -250.

The resulting total log-likelihood of the parameter values is much more negative:

In [None]:
sum(compute_ll_bad(df["height"], df["midparental_height"]))

## Changing the parameters to maximize the value of the log-likelihood is known as _maximum likelihood estimation_, or MLE.

Maximum likelihood estimation is the core technique for
- classical frequentist estimation
- much of contemporary machine learning

In the latter context,
the negative log-probability of the observed data according to the model
typically appears as part of the "loss function" or "cost".

This gives a probabilistic and Bayesian interpretation of these algorithms,
which can be handy in understanding them,
even when they aren't approached with the tools we use in this class.

If you take a machine learning, pattern recognition, or AI class
in which you "fit models to data"
by using gradient descent or something similar
to minimize a loss,
you are most likely doing MAP or MLE
without realizing it.

`pyMC` doesn't have a `find_MLE` function directly,
but as it turns out,
we can still do MLE.

## With the right choice of prior, we can turn MAP inference into maximum likelihood estimation.

Recall the posterior was defined only up to a proportionality constant:

$$
\color{green}{p(\text{slope}, \text{intercept}, \sigma \vert \text{data})}
\propto \color{darkgoldenrod}{p(\text{data} \vert \text{slope}, \text{intercept}, \sigma)}
\cdot \color{darkblue}{p(\text{slope}, \text{intercept}, \sigma)}
$$

If we choose a prior where
$p(\text{slope}, \text{intercept}, \sigma)$ is constant,
then we can simplify to

$$
\color{green}{p(\text{slope}, \text{intercept}, \sigma \vert \text{data})}
\propto \color{darkgoldenrod}{p(\text{data} \vert \text{slope}, \text{intercept}, \sigma)}
$$

That is, the more likely the data looks given the parameters,
the more likely it is that those parameters are correct.

So if we apply `find_MAP` to a model with a constant prior,
then the result will also maximize the likelihood.

For continuous variables, the constant priors are the `Flat` and `HalfFlat` priors.

In [None]:
with pm.Model() as ordinary_least_squares:
    # Flat priors on both line parameters.
    Intercept = pm.Flat("Intercept")
    Slope = pm.Flat("Slope")
    
    # Sigma is a fixed constant here, not a random variable.
    Sigma = 1
    
    # Normal likelihood: the mean is the line evaluated at each midparental height.
    Heights = pm.Normal("Heights",
                       mu=Slope * df["midparental_height"] + Intercept,
                       sd=Sigma,
                       observed=df["height"])

This is called the "ordinary least squares" model because
- maximizing the likelihood means minimizing the squared error
- it's the "ordinary" or "typical" model for frequentists

When most folks say "linear regression", this is the model they are thinking of.

When they say "fit a linear regression model",
they mean to do something equivalent to the `find_MAP` call in the cell below.

In [None]:
# The result is indexed by variable name below to pull out the point estimates.
OLS_MAP = pm.find_MAP(model=ordinary_least_squares)

MAP_slope, MAP_intercept = OLS_MAP["Slope"], OLS_MAP["Intercept"]
# Bare expression so the notebook displays the estimates.
OLS_MAP

Notice the `logp` output in red:
for this model with `Flat` priors,
this is equal to the log-likelihood.

If we feed the correct parameters to `compute_normal_ll`,
we'll get the same value:

In [None]:
# Per-observation log-likelihoods at the MAP parameters, then their total.
lls = compute_normal_ll(df["height"], df["midparental_height"], MAP_slope, MAP_intercept, sigma=1)
np.sum(lls)

We can check the value of the log-posterior probability at any time using a `Model`'s
`.logp` method,
which takes a dictionary of parameter values:

In [None]:
ordinary_least_squares.logp({"Intercept": MAP_intercept, "Slope": MAP_slope})

The results will be consistent for other choices of the `Slope` and `Intercept`.

In [None]:
# Baseline parameters (slope 0, intercept = mean height): print the model's
# logp, then display the hand-computed total log-likelihood for comparison.
print(ordinary_least_squares.logp({"Intercept": mean_height, "Slope": 0}))
sum(compute_normal_ll(df["height"], df["midparental_height"], 0, mean_height, sigma=1))

The next visualization is a 2-D version of the information in the 3-D plot above,
now with the predictions for a given setting of the parameters added.

In [None]:
slope, intercept = MAP_slope, MAP_intercept
## check randomly-chosen parameters close to the MAP
# slope, intercept = perturb_parameters(MAP_slope, MAP_intercept)
# check "baseline" model that always predicts mean
# NOTE: this assignment runs last, so as written the baseline is what gets
# plotted; comment it out to plot the MAP parameters assigned above.
slope, intercept = 0, mean_height

datalikelihood_scatter(df["height"], df["midparental_height"],
                    intercept=intercept,
                    slope=slope)

The predictions appear as a black line over a colored scatter plot of the observed data.

Once again, the log-likelihood values correspond to color:
brightly-colored datapoints contribute a small negative number to the log-likelihood,
while more darkly-colored datapoints contribute larger negative numbers.

The total log-likelihood is printed at the top of the chart.

You're encouraged to try uncommenting some of the commented lines above to see
how the predictions and log-likelihood relate for other choices of the parameters
and to try other values and see if you can predict what you will see.

# It is typical to standardize data and measures of performance.

Log-probabilities are nice, but the numbers are fairly hard to interpret.

Is -6000 good or bad?

For this model,
it is good relative to the other parameter values we checked,
but it's unclear whether it means our model is highly accurate or not.

## We'd like a number for our performance that varies between 0 and 1.

Where $0$ means "as bad as a strawman" and $1$ means "the best possible".

### We start by defining a baseline value for the parameters.

In [None]:
# Slope 0 and intercept mean_height: every prediction equals the mean height.
baseline_predictions = predict_height(df["midparental_height"],
                                      0, mean_height)

This is meant to be something whose performance we can exceed.

### Then, we compute the errors and mean squared error for those parameters.

In [None]:
baseline_errors = baseline_predictions - df["height"]

In [None]:
(baseline_errors ** 2).mean()

In [None]:
def prediction_MSE(predictions, observed_values):
    """Return the mean of the squared differences between the two inputs."""
    squared_errors = (predictions - observed_values) ** 2
    return squared_errors.mean()

### And do the same for the parameters we want to evaluate.

In [None]:
# Predictions at the MAP parameters and their signed errors
# (prediction minus observation).
predictions = predict_height(df["midparental_height"],
                             MAP_slope, MAP_intercept)
errors = predictions - df["height"]

In [None]:
prediction_MSE(predictions, df["height"])

### Then we take a ratio:

In [None]:
prediction_MSE(predictions, df["height"]) / prediction_MSE(baseline_predictions, df["height"])

In [None]:
def ratio_of_errors(predictions, baseline_predictions, observed_values):
    """Return the MSE of ``predictions`` divided by the MSE of the baseline.

    Both mean squared errors are measured against ``observed_values``.
    """
    denominator = prediction_MSE(baseline_predictions, observed_values)
    numerator = prediction_MSE(predictions, observed_values)
    return numerator / denominator

In [None]:
ratio_of_errors(predictions, baseline_predictions, df["height"])

### If our predictions were perfect, we'd get a ratio of 0:

In [None]:
ratio_of_errors(df["height"], baseline_predictions, df["height"])

### If our predictions were no better than baseline, we'd get a ratio of 1:

In [None]:
ratio_of_errors(baseline_predictions, baseline_predictions, df["height"])

### Almost there: if we subtract from 1, then 0 means baseline performance and 1 means perfect performance.

In [None]:
1 - ratio_of_errors(predictions, baseline_predictions, df["height"])

## This value is known as the _variance explained_.

It is one way of capturing the fraction of the uncertainty in the data
that was "explained away" by the model.

Unlike the log-probability, it's directly tied to the Normal likelihood,
since we used mean squared error.

In [None]:
# Draw 10 rows at random; no random_state is passed, so this call does not
# pin which rows are selected.
small_sample = df.sample(n=10)

In [None]:
make_error_plot(0, mean_height, small_sample)

In [None]:
make_error_plot(MAP_slope, MAP_intercept, small_sample)

If our errors are zero on average, then the ratio of mean squared errors is equal to the ratio of variances:

In [None]:
print(np.var(errors), np.var(baseline_errors))

In [None]:
print(np.var(errors) / np.var(baseline_errors), 1 - np.var(errors) / np.var(baseline_errors))

### The square root of this value is known as the _correlation_, aka _Pearson's $r$_.

And hence the value is sometimes called $R^2$.

We can calculate it directly ourselves

In [None]:
np.sqrt(1 - ratio_of_errors(predictions, baseline_predictions, df["height"]))

or use `scipy.stats`, which calls it `pearsonr`

In [None]:
scipy.stats.pearsonr(df["height"], df["midparental_height"])[0]

or use `numpy`, which calls it `corr`elation `coef`ficient:

In [None]:
np.corrcoef(df["height"], df["midparental_height"])[0, 1]

### If the variance explained by a linear model is not 0, we say the two variables are _correlated_.

## It is also common to standardize data before performing any modeling.

Is a midparental height of 72 large or small?

We know that it's quite above average because of our experience with the data,
but without that context, it's unclear.

If we subtract off the mean,
then we know that values above 0 are bigger than average
and below 0 are smaller than average.

If we divide by the standard deviation,
then most values will be between ±2.

In [None]:
def standardize(data):
    """Subtract the mean of ``data`` and divide by its standard deviation.

    Uses the ``.mean()`` and ``.std()`` methods of ``data`` itself.
    """
    centered = data - data.mean()
    return centered / data.std()

standardized_heights = standardize(df["height"])

# Displayed as a (mean, std) pair; after standardizing these should be
# approximately 0 and 1.
standardized_heights.mean(), standardized_heights.std()

In [None]:
pm.stats.hpd(standardized_heights)

In [None]:
f, ax = plt.subplots(figsize=(8, 4))
sns.distplot(standardized_heights);

This is also known as _$z$-scoring_.

Once data is $z$-scored,
we can immediately tell whether a value is close to average: it will be close to 0.
And we also know the "scale" of our data: typical values should be within ±2.

## Standardization doesn't fundamentally change the relationships between variables.

It's equivalent to a change of units,
like going from measuring heights in centimeters to inches
or switching from measuring temperatures in Fahrenheit to Celsius.

We can see this in the `jointplot` of the data:

In [None]:
standardized_midparental_heights = standardize(df["midparental_height"])

# Pass the vectors by keyword, matching the earlier jointplot call; newer
# seaborn releases no longer accept them positionally.
sns.jointplot(x=standardized_midparental_heights, y=standardized_heights);

If you compare this plot to the original `jointplot`,
you'll see that the "shapes" haven't changed, only the values on the axes.

## Standardization doesn't affect the correlation.

In [None]:
correlation = scipy.stats.pearsonr(standardized_heights, standardized_midparental_heights)[0]
correlation

## The correlation is also equal to the slope of the MLE regression line for standardized data.

We can define a pyMC model for standardized data with minimal changes:

In [None]:
with pm.Model() as standardized_OLS:
    # Only the slope is a free parameter, with a flat prior.
    Slope = pm.Flat("Slope")
    Intercept = 0  # consider: why is this value 0?
    
    # Normal likelihood with sd fixed at 1, on the standardized columns.
    ObservedValues = pm.Normal("ObservedValues",
                               mu=Slope * standardized_midparental_heights + Intercept,
                               sd=1,
                               observed=standardized_heights)

The `Intercept` variable is included only to make the connection between the two models clear.

In [None]:
# Sample from the model and convert the samples to a DataFrame using the
# shared course helpers.
# NOTE(review): both helpers live in shared.src.utils.util and are not
# visible here -- confirm sampler settings and output columns there.
standardized_trace = shared_util.sample_from(standardized_OLS)
standardized_posterior_df = shared_util.samples_to_dataframe(standardized_trace)

In [None]:
pm.plot_posterior(standardized_trace, ref_val=correlation);

The interpretation of this slope is as follows:

> for every increment of the midparental height by one standard deviation, the expected height for their children increased by about a third of a standard deviation 

Notice that the correlation is close to the center of the posterior.

If we use `find_MAP`, the match is even better:

In [None]:
standardized_OLS_MAP = pm.find_MAP(model=standardized_OLS)
standardized_OLS_MAP, correlation

Notice that the `logp` is very different, even though the $R^2$ is the same.

This is a major issue with using `logp` to compare models:
it is very sensitive to irrelevant details.

We can also calculate the correlation coefficient "by hand" and obtain the same answer.

In [None]:
# Predictions from the MAP slope with intercept 0, and a baseline (slope 0,
# intercept 0) that always predicts 0.
predictions_standardized = predict_height(
    standardized_midparental_heights, standardized_OLS_MAP["Slope"], 0)
baseline_predictions_standardized = predict_height(
    standardized_midparental_heights, 0, 0)

# Square root of one minus the error ratio, i.e. the correlation "by hand".
np.sqrt(1 - ratio_of_errors(predictions_standardized, baseline_predictions_standardized , standardized_heights))

# How do we know when we can ignore the linear relationship between two variables?

For example,
Galton's original explanation of his findings presumed that individuals married
without respect to height:
if not, then the predicted heights of grandchildren might not exhibit "regression to mediocrity".

Let's check the data for whether that was a reasonable assumption to make:

In [None]:
standardized_maternal_heights = standardize(df["mother"])
standardized_paternal_heights = standardize(df["father"])

with pm.Model() as smaller_effect_model:
    Slope = pm.Flat("Slope")
    # The observed node is named "MaternalHeights", so it observes the
    # mothers' standardized heights and is predicted from the fathers':
    # Slope is the expected change in maternal height (in standard
    # deviations) per standard deviation of paternal height.
    ObservedValues = pm.Normal("MaternalHeights",
                               mu=Slope * standardized_paternal_heights,
                               sd=1,
                               observed=standardized_maternal_heights)

In [None]:
smaller_effect_trace = shared_util.sample_from(smaller_effect_model)
smaller_effect_posterior_df = shared_util.samples_to_dataframe(smaller_effect_trace)

In [None]:
pm.plot_posterior(smaller_effect_trace, ref_val=0);

It seems that the 95% posterior density does not include 0,
so by the method we've been using so far in the class,
we'd have to conclude that there's a flaw in Galton's analysis.

## Standardization also lets us consider the _magnitude_ of a relationship in standard terms.

The posterior above indicates that we should change our expectation of a mother's height
by a bit under a tenth of a standard deviation every time the father gets taller or shorter by a standard deviation.

This is a _very small_ effect.

For a father who is three standard deviations away from average height,
aka someone who is six feet, two inches,
we predict a maternal height of less than an inch above average.

## Enter the ROPE: Region of Practical Equivalence

Before running our analysis, we define a set of values, close to 0,
which we consider to be _practically equivalent to 0_.

#### We call this the Region Of Practical Equivalence.

Once we obtain a posterior, we can check the overlap between our posterior and this region:

For a big effect, this overlap will be small or 0:

In [None]:
pm.plot_posterior(standardized_trace, ref_val=0, rope=(-0.05, 0.05));

I selected as my ROPE here -0.05 to 0.05:
a correlation is _effectively_ 0 if it suggests
that I only need to change my prediction by a factor of 1 in 20 or less
when I take into account the independent variable.

For a small effect, this overlap will be larger:

In [None]:
pm.plot_posterior(smaller_effect_trace, ref_val=0, rope=(-0.05, 0.05));

As usual, we test the probability we assign to the statement
"the correlation is within the region of practical equivalence"
by checking whether it is true on our posterior.

In [None]:
def is_in_ROPE(sample, ROPE=(-0.05, 0.05)):
    """Tell whether ``sample`` lies strictly inside the ROPE.

    ``ROPE`` is indexed as ``(lower, upper)``. Both bounds are exclusive,
    so a value exactly equal to either bound is reported as outside.
    """
    return ROPE[0] < sample and sample < ROPE[1]

In [None]:
# Mean of the per-sample True/False results = fraction of "Slope" samples inside the ROPE.
smaller_effect_posterior_df["Slope"].apply(is_in_ROPE).mean()

There is a fairly decent chance, around 25%,
that the correlation between maternal and paternal heights is negligible,
according to this definition of the ROPE.

# In addition to a likelihood, the definition of a regression model includes a prior over the parameters.

So far today, we've focused on _flat_ priors,
which are technically not probability distributions,
so that we could connect to more mainstream MLE methods.

Unlike in tests for differences of means,
it's actually quite common for regression models to include
Bayesian elements, even if they aren't always thought of that way.

### For example, a common technique called _ridge regression_ is equivalent to placing a Normal prior on the slope.

In [None]:
with pm.Model() as ridge_regression_model:
    # ridge regression <> Normal prior on slope
    Slope = pm.Normal("Slope", mu=0, sd=2.5e-2)
    # This prior says: I think it is very likely that
    #  the correlation is between -5e-2 and 5e-2 (and so inside the ROPE)
    
    
    ObservedValues = pm.Normal("ObservedValues",
                               mu=Slope * standardized_maternal_heights,
                               sd=1,
                               observed=standardized_paternal_heights)

In [None]:
# Fit the ridge model with the same shared helper used for the earlier models.
ridge_trace = shared_util.sample_from(ridge_regression_model)

In [None]:
# No ref_val here; only the ROPE bounds are passed to the plot.
pm.plot_posterior(ridge_trace, rope=(-0.05, 0.05));

The overlap between the posterior and the ROPE is now much larger.

But the MAP value is still not 0.

See the following section, on LASSO regression, for an indication of how to set a prior that gives MAP estimates that are exactly 0.

In [None]:
# Maximum a posteriori (MAP) point estimate for the ridge model; shown as the cell output.
pm.find_MAP(model=ridge_regression_model)

### With a strong prior that there is no relationship, strong evidence is required to conclude a relationship is present.

In [None]:
# Same Normal prior as the earlier ridge model, but fit to the standardized
# midparental-height / height data.
# NOTE: this rebinds `ridge_regression_model`, replacing the maternal/paternal model.
with pm.Model() as ridge_regression_model:
    Slope = pm.Normal("Slope", mu=0, sd=2.5e-2)
    # This prior says: I think it is extremely likely that
    #  the correlation is between -5e-2 and 5e-2 (and so inside the ROPE)
    
    
    ObservedValues = pm.Normal("ObservedValues",
                               mu=Slope * standardized_midparental_heights,
                               sd=1,
                               observed=standardized_heights)

In [None]:
# Rebinds `ridge_trace`: the earlier maternal/paternal trace is no longer reachable by this name.
ridge_trace = shared_util.sample_from(ridge_regression_model)

In [None]:
# Same plot call as for the first ridge fit, now on the re-bound trace.
pm.plot_posterior(ridge_trace, rope=(-0.05, 0.05));

In [None]:
# MAP estimate for whichever model is currently bound to `ridge_regression_model`.
pm.find_MAP(model=ridge_regression_model)

### Another technique, called LASSO regression, is used to obtain MAP estimates that are exactly 0.

LASSO is equivalent to placing a [_Laplace_](https://en.wikipedia.org/wiki/Laplace_distribution)
prior on the slope:

In [None]:
# Draw the Laplace density for a narrow (b=0.1) and a wide (b=1) scale on one axis.
f, ax = plt.subplots(figsize=(8, 4))
xs = np.linspace(-3, 3, num=1000)
for laplace_scale in (0.1, 1):
    # logp gives the log-density; exponentiate to plot the density itself.
    log_density = pm.Laplace.dist(mu=0, b=laplace_scale).logp(xs).eval()
    ax.plot(xs, np.exp(log_density), lw=4)

In [None]:
with pm.Model() as lasso_regression_model:
    # lasso regression <> Laplace prior on slope
    # NOTE: b=0.01 is a tighter scale than either curve plotted in the
    #  previous cell (b=0.1 and b=1).
    Slope = pm.Laplace("Slope", mu=0, b=0.01)
    ObservedValues = pm.Normal("ObservedValues",
                               mu=Slope * standardized_maternal_heights,
                               sd=1,
                               observed=standardized_paternal_heights)

In [None]:
# target_accept=0.9 is handed to the shared sampling helper.
# NOTE(review): presumably raised because the sharply peaked Laplace prior is harder to sample; confirm.
lasso_trace = shared_util.sample_from(lasso_regression_model, target_accept=0.9)

In [None]:
# Posterior under the Laplace prior, with the same ROPE bounds as the ridge plots.
pm.plot_posterior(lasso_trace, rope=(-0.05, 0.05));

In [None]:
# MAP point estimate for the lasso model; shown as the cell output.
pm.find_MAP(model=lasso_regression_model)

Try this with the standardized height/midparental height data, and you'll see that the MAP estimate is not 0.

Using either of the priors specified,
we can verify Galton's assumption
that the maternal and paternal heights are negligibly correlated.

# If our data is not `Normal`, we can use a different likelihood.

One of the most common causes of non-Normality is _outliers_:
large, rare effects.

For example, in the `mpg` dataset that you might work with in lab on Friday,
there's at least one incorrect data entry.

Recall how quickly the log-likelihood for `Normal` data dropped:
when a data point is far away from the prediction, the log-likelihood suffers immensely.

### A technique for doing accurate regression in the presence of outliers is called _robust regression_.

### In a Bayesian model, robust regressions correspond to different choices of likelihood.

Specifically, choices of probability distributions with "heavy tails",
like the `Cauchy`, the `StudentT`, or the `Laplace`.

In [None]:
with pm.Model() as robust_linear_regression:
    # robust regression <> heavy-tailed (here Cauchy) likelihood
    Slope = pm.Flat("Slope")
    Intercept = 0  # held fixed at 0 rather than inferred
    Beta = 1  # Cauchy scale; plays the role of Sigma in a Normal

    # Pass the named scale instead of repeating the literal, so that
    #  changing Beta above actually changes the likelihood.
    ObservedValues = pm.Cauchy("ObservedValues",
                               alpha=Slope * standardized_midparental_heights + Intercept,
                               beta=Beta,
                               observed=standardized_heights)

In [None]:
# Fit the Cauchy-likelihood model with the shared sampling helper.
robust_trace = shared_util.sample_from(robust_linear_regression)

Because our data doesn't suffer much from outliers, the results are fairly similar to what we got with a `Normal` likelihood:

In [None]:
# Plain posterior plot: no reference value or ROPE is passed.
pm.plot_posterior(robust_trace);

#### Bayesian methods allow our data to tell us whether we need to use a robust method.

The `StudentT` distribution has a parameter, `nu`, that determines whether it has heavy tails.

When `nu` is close to 1, the `StudentT` has very heavy tails;
when `nu` is large, say above 20, the `StudentT` looks more like a `Normal`,
and no longer has a heavy tail.

If we make this parameter part of our model,
we can get a posterior over how necessary robust regression is
for our data.

In [None]:
with pm.Model() as optionally_robust_linear_regression:
    Slope = pm.Flat("Slope")
    Intercept = 0  # held fixed at 0 rather than inferred
    # "Degrees of Freedom": ~30 means data is normal-ish,
    #   under 10 means data has outliers
    Nu = pm.DiscreteUniform("Nu", lower=1, upper=30)
    Sigma = pm.Exponential("Sigma", lam=1)

    # Include Intercept in the location, matching the Cauchy model above;
    #  it is 0, so the predictions are unchanged.
    ObservedValues = pm.StudentT("ObservedValues",
                                 mu=Slope * standardized_midparental_heights + Intercept,
                                 sd=Sigma,
                                 nu=Nu,
                                 observed=standardized_heights)

In [None]:
# Fit the StudentT model; Slope, Nu and Sigma are all free parameters here.
optionally_robust_trace = shared_util.sample_from(optionally_robust_linear_regression)

In [None]:
# ref_val lists one reference value per plotted variable.
# NOTE(review): the order is presumably Slope, Nu, Sigma; confirm against the rendered panels.
pm.plot_posterior(optionally_robust_trace, figsize=(8, 4), ref_val=[0, 1, 1]);

The fact that the posterior for `Nu` is shifted to the right, i.e. close to 30,
indicates that there are not substantial outliers in our data.