In [None]:
import statsmodels
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import os

import statsmodels.api as sm

This cell has $\LaTeX$ commands.
$$
\newcommand{\E}{\text{E}}
\newcommand{\var}{\text{Var}}
$$

In the regression model,
$$
Y_i = \beta_0 + \beta_1 X_i + \epsilon_i,
$$
the terms $\beta_0 + \beta_1 X_i$ form the systematic part and $\epsilon_i$ is called noise. $\epsilon_i$ are assumed to be iid $N(0, \sigma^2)$. Therefore,
$$
\E(Y_i\;|\;X_i = x_i) = \beta_0 + \beta_1 x_i + 0
$$
and
$$
\var(Y_i\;|\;X_i = x_i) = 0 + \beta_1^2\times 0 + \sigma^2 = \sigma^2.
$$
Thus, $Y_i\;|\; X_i = x_i \sim N(\beta_0 + \beta_1 x_i, \sigma^2)$. It can be shown that under the assumption of Gaussian noise, the maximum likelihood estimates of $\beta_0$ and $\beta_1$ are the least squares estimates
$$
\begin{eqnarray}
\hat{\beta}_1 &=& r_{xy}\frac{S_y}{S_x} \\
\hat{\beta}_0 &=& \bar{Y} - \hat{\beta}_1\bar{X}
\end{eqnarray}
$$
where $r_{xy}$ is the sample correlation between $X$ and $Y$, and $S_x$ and $S_y$ are their sample standard deviations. For a given data set, regression analysis gives estimates of the true parameters of the regression model.

Although the intercept is of mathematical interest, it may not be important for the problem at hand. That could be the reason why `statsmodels.api.OLS` does not add an intercept by default.

In [None]:
dataset = 'diamond.csv'
filepath = os.path.join('.', 'datasets', dataset)

# Fail with a descriptive error instead of a bare `assert False`: assertions
# are stripped under `python -O`, which would leave `df` undefined and make
# later cells fail with a confusing NameError.
if not os.path.isfile(filepath):
    raise FileNotFoundError(f"Dataset file not found: {filepath!r}")

# Load the diamond data; later cells read its "carat" and "price" columns.
df: pd.DataFrame = pd.read_csv(filepath)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Design matrix for the first model: raw carat plus an explicit intercept
# column, since OLS does not add one on its own.
exog_1 = sm.add_constant(df["carat"])

model_1 = sm.OLS(endog=df["price"], exog=exog_1)
result_1 = model_1.fit()
result_1.summary()

For this data, the estimate of the intercept is $\hat{\beta}_0 = -259.63$. It is interpreted as the price of a zero carat diamond. This statement, although correct from the perspective of the regression model, is meaningless to a diamond trader. To avoid problems like these, one shifts the origin to the point $(\bar{X}, 0)$ and interprets the intercept as the price of an 'average' diamond. We build such a model in the next cell.

# Linear model with X centred at its mean

In [None]:
# Centre carat at its sample mean so that the fitted intercept is the
# expected price of a diamond of average weight.
carat_centred = df["carat"] - df["carat"].mean()

model_2 = sm.OLS(endog=df["price"], exog=sm.add_constant(carat_centred))
result_2 = model_2.fit()
result_2.summary()

# Predict prices for new diamonds

In [None]:
# New carat values, shifted by the training mean because model_2 was fitted
# on centred carat.
carat_mean = df["carat"].mean()
newX = np.array([0.16, 0.27, 0.34]) - carat_mean

new_data = pd.DataFrame({"carat": newX})
result_2.predict(sm.add_constant(new_data))

In [None]:
# Raw data with the fitted regression line from the centred model overlaid.
fig, ax = plt.subplots()
ax.scatter(x=df["carat"], y=df["price"])
ax.plot(
    df["carat"],
    result_2.predict(),
    color="black",
    linestyle="--",
    label="regression",
)
ax.set_xlabel("carat")
ax.set_ylabel("price")
_ = ax.legend()

Refer to [an answer on Stackexchange](https://stats.stackexchange.com/questions/16493/difference-between-confidence-intervals-and-prediction-intervals) to understand the difference between a confidence interval and a prediction interval.

# Confidence and prediction intervals

In [None]:
# model_2 was fitted on mean-centred carat, so new carat values must be
# shifted by the same training mean before being handed to the model.
# Passing the raw values would evaluate the fit at the wrong points.
new_carat = np.array([0.16, 0.27, 0.34])
new_carat_centred = new_carat - df["carat"].mean()

prediction_2: statsmodels.regression._prediction.PredictionResults = (
    result_2.get_prediction(sm.add_constant(new_carat_centred))
)
# 95% confidence interval for the mean price at each new carat value.
prediction_2.conf_int(alpha=0.05)

In [None]:
# Tabulate the predicted means with their confidence and prediction bounds.
prediction_2.summary_frame(alpha=0.05)

We will draw the plot of the raw data and the regression line again, now with the confidence and prediction intervals as well.

In [None]:
# In-sample predictions; carat is centred exactly as it was when model_2
# was fitted.
prediction_2_insample = result_2.get_prediction(
    sm.add_constant(df["carat"] - df["carat"].mean())
).summary_frame()
reg_line = prediction_2_insample["mean"]
upp_line_ci = prediction_2_insample["mean_ci_upper"]
low_line_ci = prediction_2_insample["mean_ci_lower"]
upp_line_pi = prediction_2_insample["obs_ci_upper"]
low_line_pi = prediction_2_insample["obs_ci_lower"]

# The interval bounds are curves, so they must be drawn in increasing carat
# order; plotting in data order would join non-adjacent points and produce
# zig-zag lines instead of smooth bands.
carat_values = df["carat"].to_numpy()
order = np.argsort(carat_values)
carat_sorted = carat_values[order]

plt.scatter(x=df["carat"], y=df["price"])

# (series, colour, line style, legend label) for every line on the plot.
line_specs = [
    (reg_line, "black", "--", "regression"),
    (upp_line_ci, "red", "dotted", "Upper conf. level"),
    (low_line_ci, "red", "dotted", "Lower conf. level"),
    (upp_line_pi, "green", "dotted", "Upper pred. level"),
    (low_line_pi, "green", "dotted", "Lower pred. level"),
]
for series, colour, style, label in line_specs:
    plt.plot(
        carat_sorted,
        series.to_numpy()[order],
        color=colour,
        linestyle=style,
        label=label,
    )

plt.xlabel("carat")
plt.ylabel("price")
_ = plt.legend()