Based on https://bambinos.github.io/bambi/notebooks/orthogonal_polynomial_reg.html

In [None]:
import arviz as az
import bambi as bmb
import formulae
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from typing import Optional

# Seed NumPy's global RNG; SEED is also handed to the model fits as `random_seed`.
SEED = 1234
np.random.seed(seed=SEED)

# Apply the "arviz-darkgrid" matplotlib style sheet to every figure created below.
plt.style.use("arviz-darkgrid")

In [None]:
# Temporary workaround: silence every warning so the cell outputs stay readable.
# NOTE: this is a blanket filter, so library warnings are hidden as well.
import warnings

warnings.filterwarnings(action="ignore")

In [None]:
# Load seaborn's "mpg" example dataset and preview its first five rows.
df_mpg = sns.load_dataset(name="mpg")
df_mpg.head(n=5)

In [None]:
# Drop rows with a missing predictor or response before plotting and modelling.
df_mpg = df_mpg.dropna(subset=["horsepower", "mpg"])

plt.figure(figsize=(14, 6))

# Left panel: mpg against horsepower with a fitted regression line.
left_ax = plt.subplot(1, 2, 1)
sns.regplot(
    data=df_mpg,
    x="horsepower",
    y="mpg",
    line_kws={"color": "firebrick"},
    ax=left_ax,
)

# Right panel: histogram of the response with a KDE overlay.
right_ax = plt.subplot(1, 2, 2)
sns.histplot(df_mpg["mpg"], edgecolor="black", kde=True, ax=right_ax)
right_ax.set_xlabel("MPG")
right_ax.set_ylabel("Count")
right_ax.set_title("Histogram of MPG")

plt.tight_layout()
plt.show()

In [None]:
# Baseline model: mpg as a straight-line function of horsepower.
mpg_hp_linear_mod = bmb.Model(formula="mpg ~ horsepower", data=df_mpg)

# Request the pointwise log-likelihood so this fit can be passed to az.compare later.
mpg_hp_linear_fit = mpg_hp_linear_mod.fit(
    random_seed=SEED,
    idata_kwargs={"log_likelihood": True},
)

# Run response-level predictions on the fit; later cells read posterior["mu"] from it.
mpg_hp_linear_mod.predict(mpg_hp_linear_fit, kind="response")

In [None]:
fig = plt.figure()
pred_ax = plt.gca()

# Draw the prediction plot twice on the same axes, once per interval probability.
for p in (0.68, 0.95):
    bmb.interpret.plot_predictions(
        mpg_hp_linear_mod,
        mpg_hp_linear_fit,
        "horsepower",
        prob=p,
        pps=True,
        legend=True,
        ax=pred_ax,
    )

# Overlay the observed data on top of the predictions.
sns.scatterplot(
    data=df_mpg, x="horsepower", y="mpg", color="blue", label="True Data"
);

In [None]:
# Posterior mean of mu per observation, averaged over all chains and draws.
predicted_mpg = mpg_hp_linear_fit.posterior["mu"].mean(dim=("chain", "draw"))
residuals = df_mpg["mpg"] - predicted_mpg

# Residuals against the predictor, with a zero reference line.
resid_ax = sns.scatterplot(data=df_mpg, x="horsepower", y=residuals)
resid_ax.axhline(0, color="black", linewidth=2)
resid_ax.set_ylabel("Residuals")
resid_ax.set_title("Residuals for linear model")

In [None]:
# Second-degree polynomial model of mpg on horsepower, built with the formula's poly() term.
mpg_hp_sq_mod = bmb.Model(formula="mpg ~ poly(horsepower, 2)", data=df_mpg)

# Request the pointwise log-likelihood so this fit can be passed to az.compare later.
mpg_hp_sq_fit = mpg_hp_sq_mod.fit(
    random_seed=SEED,
    idata_kwargs={"log_likelihood": True},
)

# Run response-level predictions on the fit; later cells read posterior["mu"] from it.
mpg_hp_sq_mod.predict(mpg_hp_sq_fit, kind="response")

In [None]:
fig = plt.figure()
pred_ax = plt.gca()

# Draw the prediction plot twice on the same axes, once per interval probability.
for p in (0.68, 0.95):
    bmb.interpret.plot_predictions(
        mpg_hp_sq_mod,
        mpg_hp_sq_fit,
        "horsepower",
        prob=p,
        pps=True,
        legend=True,
        ax=pred_ax,
    )

# Overlay the observed data and title the axes it was drawn on.
data_ax = sns.scatterplot(
    data=df_mpg, x="horsepower", y="mpg", color="blue", label="True Data"
)
data_ax.set_title("Quadratic Fit")

In [None]:
# Posterior mean of mu per observation, averaged over all chains and draws.
predicted_mpg = mpg_hp_sq_fit.posterior["mu"].mean(dim=("chain", "draw"))
residuals = df_mpg["mpg"] - predicted_mpg

# Residuals against the predictor, with a zero reference line.
resid_ax = sns.scatterplot(data=df_mpg, x="horsepower", y=residuals)
resid_ax.axhline(0, color="black", linewidth=2)
resid_ax.set_ylabel("Residuals")
resid_ax.set_title("Residuals for quadratic model")

In [None]:
# Compare the two fits; the keys become the row labels of the resulting table.
linear_vs_quadratic = {
    "Linear": mpg_hp_linear_fit,
    "Quadratic": mpg_hp_sq_fit,
}
az.compare(linear_vs_quadratic)

In [None]:
# Fit one polynomial model per degree (1 through 9), keeping both the model and
# its fit under a shared "Poly<degree>" key.
poly_models = {}
poly_fits = {}
for degree in range(1, 10):
    key = f"Poly{degree}"
    model = bmb.Model(f"mpg ~ poly(horsepower, {degree})", df_mpg)
    fit = model.fit(
        random_seed=SEED,
        progressbar=False,
        idata_kwargs={"log_likelihood": True},
    )
    poly_models[key], poly_fits[key] = model, fit

# Comparison table across all fitted degrees.
cmp = az.compare(poly_fits)
cmp

In [None]:
ax = az.plot_compare(cmp, figsize=(12, 4), plot_ic_diff=False, legend=False)

# Shade the band from 4 ELPD units below the first row's value up to that value.
best_loo = cmp["elpd_loo"].iat[0]
ax.axvspan(xmin=best_loo - 4, xmax=best_loo, color="C0", alpha=0.2);

In [None]:
# Select the degree-7 model and its fit, then run response-level predictions on it.
best_key = "Poly7"
best_model, best_fit = poly_models[best_key], poly_fits[best_key]
best_model.predict(best_fit, kind="response")

# Posterior mean of mu per observation, averaged over all chains and draws.
predicted_mpg = best_fit.posterior["mu"].mean(dim=("chain", "draw"))
residuals = df_mpg["mpg"] - predicted_mpg

# Residuals against the predictor, with a zero reference line.
resid_ax = sns.scatterplot(data=df_mpg, x="horsepower", y=residuals)
resid_ax.axhline(0, color="black", linewidth=2)
resid_ax.set_ylabel("Residuals")
resid_ax.set_title("Residuals for degree 7 model");

In [None]:
fig = plt.figure()
pred_ax = plt.gca()

# Draw the prediction plot twice on the same axes, once per interval probability.
for p in (0.68, 0.95):
    bmb.interpret.plot_predictions(
        best_model,
        best_fit,
        "horsepower",
        prob=p,
        pps=True,
        legend=True,
        ax=pred_ax,
    )

# Overlay the observed data and title the axes it was drawn on.
data_ax = sns.scatterplot(
    data=df_mpg, x="horsepower", y="mpg", color="blue", label="True Data"
)
data_ax.set_title("Best Fit Model: 7th Degree Polynomial");

In [None]:
# Horsepower grid from 0 to 500 (250 points) used as new data for prediction.
extrapolate_x_hp = np.linspace(start=0, stop=500, num=250)
extrapolation_df = pd.DataFrame({"horsepower": extrapolate_x_hp})
mpg_hp_sq_mod.predict(mpg_hp_sq_fit, data=extrapolation_df)

# Mean of mu over chains and draws, read back from the fit after predicting on the grid.
sq_mean_mu = mpg_hp_sq_fit.posterior["mu"].mean(dim=("chain", "draw"))

obs_ax = sns.scatterplot(
    data=df_mpg, x="horsepower", y="mpg", color="blue", label="True Data"
)
obs_ax.plot(extrapolate_x_hp, sq_mean_mu, color="red", label="Extrapolated Fit")
obs_ax.set_xlim(left=0, right=extrapolate_x_hp.max())

obs_ax.legend(frameon=False)

In [None]:
# Predict with the linear model on the extrapolation grid defined earlier.
extrapolation_df = pd.DataFrame({"horsepower": extrapolate_x_hp})
mpg_hp_linear_mod.predict(mpg_hp_linear_fit, data=extrapolation_df)

# Mean of mu over chains and draws; used for both the curve and the lower y-limit.
linear_mean_mu = mpg_hp_linear_fit.posterior["mu"].mean(dim=("chain", "draw"))

obs_ax = sns.scatterplot(
    data=df_mpg, x="horsepower", y="mpg", color="blue", label="True Data"
)
obs_ax.plot(extrapolate_x_hp, linear_mean_mu, color="red", label="Predicted")

# Grey out everything between the current lower y-limit and zero (negative mpg).
# The limit is read after the curve is drawn and before the y-axis is rescaled.
lower_edge = obs_ax.get_ylim()[0]
obs_ax.fill_between(
    extrapolate_x_hp,
    lower_edge,
    0,
    color="grey",
    alpha=0.5,
    label="MPG Forbidden region",
)

obs_ax.set_xlim(left=0, right=extrapolate_x_hp.max())
obs_ax.set_ylim(bottom=linear_mean_mu.min())
obs_ax.legend(frameon=False);

In [None]:
# Narrower horsepower grid (0 to 300, 250 points) for the degree-7 model.
extrapolate_x_hp = np.linspace(start=0, stop=300, num=250)
extrapolation_df = pd.DataFrame({"horsepower": extrapolate_x_hp})
best_model.predict(best_fit, data=extrapolation_df)

# Mean of mu over chains and draws; used for both the curve and the lower y-limit.
best_mean_mu = best_fit.posterior["mu"].mean(dim=("chain", "draw"))

obs_ax = sns.scatterplot(
    data=df_mpg, x="horsepower", y="mpg", color="blue", label="True Data"
)
obs_ax.plot(extrapolate_x_hp, best_mean_mu, color="red", label="Extrapolated Fit")

# Grey out everything between the current lower y-limit and zero (negative mpg).
# The limit is read after the curve is drawn and before the y-axis is rescaled.
lower_edge = obs_ax.get_ylim()[0]
obs_ax.fill_between(
    extrapolate_x_hp,
    lower_edge,
    0,
    color="grey",
    alpha=0.5,
    label="MPG Forbidden region",
)

obs_ax.set_xlim(left=0, right=extrapolate_x_hp.max())
obs_ax.set_ylim(bottom=best_mean_mu.min())
obs_ax.legend(frameon=False);