# Linear regression

In [1]:
# Notebook setup

# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn import linear_model, metrics
from statsmodels.graphics.gofplots import qqplot

# Set the random seed for reproducability
np.random.seed(1337)

### Problem setting

Given a data set $\{y_i, x_{i1}, x_{i2}, ..., x_{ip}\}_{i=1}^n$ a linear regression model assumes that the relationship between the dependent variable $y$ and the vector of regressors $\bold{x}$ is linear.

This relationship is modeled through a disturbance term $\epsilon$, an unobserved random variable that adds "noise" to the linear relationship between the dependent variable and regressors.

The model takes the following form:

$y_i = \beta_0 + \beta_1 x_{i1} + ... + \beta_p x_{ip} + \epsilon_i = \bold x_i^T \bold\beta + \epsilon_i$ where $(i=1,..., n)$

Using matrix notation this can be rewrtitten the following way:

$\bold{y = X \beta + \epsilon}$

Where:

$\bold y = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}$ are the observed values AKA the endogenous variable,

$\bold X = \begin{bmatrix} 1 & x_{11} & ... & x_{1p} \\ 1 & x_{21} & ... & x_{2p} \\ \vdots & \vdots & & \vdots \\ 1 & x_{n1} & ... & x_{np} \end{bmatrix}$ are the regressors AKA exogenous variables,

$\bold \beta = \begin{bmatrix} \beta_0 \\ \beta_1 \\ \beta_2 \\ \vdots \\ \beta_p \end{bmatrix}$ is the parameter vector, AKA the regression coefficients where $\beta_0$ is the intercept term,

$\bold \epsilon = \begin{bmatrix} \epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \end{bmatrix}$ is the error term AKA noise or disturbance term.

Linear regerssion modelling is possible if the following assumptions hold true for the data:

1. The predictor variables can be treated as error free
2. The the endogenous variable is a linear combination of the exogenous variables and the coefficients
3. The variance of the errors does not depend on the exogenous variables and can be treated as constant throughout the data set
4. The errors are independent of each other
5. There must not be a perfect multicollinearity between the columns of the exogenous variables

### Evaluation

A list of metrics that can be used to evaluate the regression:

Residual sum of squares: $RSS = \sum \limits_{i=1}^{n} ( \hat y_i - \bar y )^2$

Total sum of squares: $TSS = \sum \limits_{i=1}^{n} ( y_i - \bar y )^2$

Explained sum of squares: $ESS = TSS - RSS$

$R^2$ score: $R^2 = 1 - \frac{RSS}{TSS}$

Mean absolute error: $MAE = \frac{1}{n} \sum \limits_{i=1}^{n} | \hat y_i - y_i |$

Mean squared error: $MSE = \frac{1}{n} \sum \limits_{i=1}^{n} | \hat y_i - y_i | ^ 2$

Explained variance score: $1 - \frac{Var(y - \hat y)}{Var(y)}$


### A simple example

In [2]:
# Generate a simple data set where y ~= 2 + 0.5x + e

# Generate a sample
n = 50
X = pd.DataFrame({"x1": np.random.randn(n)})
b = [2, 0.5]
y_noerror = b[0] + pd.Series(np.dot(X, b[1:]))
y = pd.Series(y_noerror + 0.05 * np.random.randn(n))

# # Plot the sample data
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=X.x1,
        y=y,
        name="Data sample",
        mode="markers",
        marker=dict(color="blue", size=6, symbol="circle"),
    )
)
fig.add_trace(
    go.Scatter(
        x=X.x1,
        y=y_noerror,
        name="True relationsip",
        mode="lines",
        line=dict(color="blue", width=1, dash="solid"),
    )
)
fig.update_traces(hovertemplate="x: %{x:0.3f}" + "<br>y: %{y:0.3f}")
fig.update_layout(
    template="plotly_white",
    width=800,
    height=700,
    title="A simple data set for linear regression with y = 2 + 0.5x1 + e",
    legend_title="Legend",
    showlegend=True,
)
fig.update_xaxes(title="x1: exogenous variable", range=None, dtick=None)
fig.update_yaxes(title="y: endogenous variable", range=None, dtick=None)
fig.show()

In [3]:
# Simple linear regression form Sklearn and calculate performance metrics

# Fit a simple linear model and get the parameters
model = linear_model.LinearRegression()
reg = model.fit(X, y)
y_pred = model.predict(X)
b0_pred = reg.intercept_
b1_pred = reg.coef_[0]

estimate = pd.DataFrame(
    {
        "estimate_x": np.linspace(X.x1.min(), X.x1.max(), n),
        "estimate_y": b0_pred + b1_pred * np.linspace(X.x1.min(), X.x1.max(), n),
    }
)

# Calculate performance metrics
tss = np.dot(y - y.mean(), y - y.mean())
rss = np.dot(y_pred - y.mean(), y_pred - y.mean())
ess = tss - rss

mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
r2 = metrics.r2_score(y, y_pred)
exp_var = metrics.explained_variance_score(y, y_pred)

print(f"Estimated beta_0 (intercept):               {b0_pred:0.4}")
print(f"Estimated beta_1 (coefficient for x1):      {b1_pred:0.4}")

print(f"TSS:                                        {tss:0.4}")
print(f"RSS:                                        {rss:0.4}")
print(f"ESS:                                        {ess:0.4}")
print(
    f"R^2 calculated from RSS and TSS:            {(1 - (rss/tss)):0.4} (note that the R^2 score below is 1-R^2)"
)

print(f"MAE:                                        {mae:0.4}")
print(f"MSE:                                        {mse:0.4}")
print(f"R^2 score:                                  {r2:0.4}")
print(f"Explained variance score:                   {exp_var:0.4}")

# Plot the estimated relationship
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=X.x1,
        y=y,
        name="Data sample",
        mode="markers",
        marker=dict(color="blue", size=6, symbol="circle"),
    )
)
fig.add_trace(
    go.Scatter(
        x=X.x1,
        y=y_noerror,
        name="True relationsip",
        mode="lines",
        line=dict(color="blue", width=1, dash="solid"),
    )
)
fig.add_trace(
    go.Scatter(
        x=estimate.estimate_x,
        y=estimate.estimate_y,
        name="Estimated relationsip",
        mode="lines",
        line=dict(color="red", width=1, dash="solid"),
    )
)
fig.update_traces(hovertemplate="x: %{x:0.3f}" + "<br>y: %{y:0.3f}")
fig.update_layout(
    template="plotly_white",
    width=800,
    height=700,
    title="A simple data set for linear regression with y = 2 + 0.5x1 + e",
    legend_title="Legend",
    showlegend=True,
)
fig.update_xaxes(title="x1: exogenous variable", range=None, dtick=None)
fig.update_yaxes(title="y: endogenous variable", range=None, dtick=None)
fig.show()

# Residual plot
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=X.x1,
        y=y_pred - y,
        name="Residuals",
        mode="markers",
        marker=dict(color="red", size=6, symbol="circle"),
    )
)
fig.update_traces(hovertemplate="x: %{x:0.3f}" + "<br>y: %{y:0.3f}")
fig.update_layout(
    template="plotly_white",
    width=800,
    height=700,
    title="Residual plot",
    legend_title="Legend",
    showlegend=True,
)
fig.update_xaxes(title="x1: exogenous variable", range=None, dtick=None)
fig.update_yaxes(title="Residuals", range=None, dtick=None)
fig.show()


# QQ plot with some help from the statsmodels library
qqplot_data = qqplot(y_pred, line="s", ax=None).gca().lines
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=qqplot_data[0].get_xdata(),
        y=qqplot_data[0].get_ydata(),
        name="Quantiles",
        mode="markers",
        marker=dict(color="black", size=6, symbol="circle"),
    )
)
fig.add_trace(
    go.Scatter(
        x=qqplot_data[1].get_xdata(),
        y=qqplot_data[1].get_ydata(),
        name="Quantile equality",
        mode="lines",
        line=dict(color="grey", width=1, dash="solid"),
    )
)
fig.update_traces(hovertemplate="x: %{x:0.3f}" + "<br>y: %{y:0.3f}")
fig.update_layout(
    template="plotly_white",
    width=800,
    height=700,
    title="QQ plot of the data sample",
    legend_title="Legend",
    showlegend=True,
)
fig.update_xaxes(title="Theoretical quantile", range=None, dtick=None)
fig.update_yaxes(title="Sample quantile", range=None, dtick=None)
fig.show()

plt.close()

Estimated beta_0 (intercept):               1.991
Estimated beta_1 (coefficient for x1):      0.5042
TSS:                                        14.19
RSS:                                        14.08
ESS:                                        0.1162
R^2 calculated from RSS and TSS:            0.008186 (note that the R^2 score below is 1-R^2)
MAE:                                        0.03927
MSE:                                        0.002324
R^2 score:                                  0.9918
Explained variance score:                   0.9918


### Multiple linear regression and regularisation

In [4]:
# Generate a data sample with multiple exogenous variables and use regularised multiple linear regeression

# Generate a sample
n = 500
X = pd.DataFrame(
    {
        "x1": np.random.random(n),
        "x2": np.random.random(n),
        "x3": np.random.random(n),
        "x4": np.random.random(n),
    }
)
b = [5, 10, 0.1, -1, -0.3]
y_noerror = b[0] + pd.Series(np.dot(X, b[1:]))
y = pd.Series(y_noerror + 0.1 * np.random.randn(n))

# Fit an unregularised linear model and get the parameters
model = linear_model.LinearRegression()
reg = model.fit(X, y)
y_pred = model.predict(X)
b_pred = [reg.intercept_] + list(reg.coef_)

# Calculate performance metrics
tss = np.dot(y - y.mean(), y - y.mean())
rss = np.dot(y_pred - y.mean(), y_pred - y.mean())
ess = tss - rss

mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
r2 = metrics.r2_score(y, y_pred)
exp_var = metrics.explained_variance_score(y, y_pred)

print("=" * 50)
print(f"Unregularised linear regression:")
print(f"Estimated beta_0 (intercept):               {b_pred[0]:0.4}")
for i in range(1, 5):
    print(f"Estimated beta_{i} (coefficient for x{i}):      {b_pred[i]:0.4}")
print(f"MSE:                                        {mse:0.4}")
print(f"R^2 score:                                  {r2:0.4}")

# ===================================================================================================
# Lasso (L1 regularised) regression
model = linear_model.Lasso(alpha=0.5)
reg = model.fit(X, y)
y_pred = model.predict(X)
b_pred = [reg.intercept_] + list(reg.coef_)

# Calculate performance metrics
tss = np.dot(y - y.mean(), y - y.mean())
rss = np.dot(y_pred - y.mean(), y_pred - y.mean())
ess = tss - rss

mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
r2 = metrics.r2_score(y, y_pred)
exp_var = metrics.explained_variance_score(y, y_pred)

print("=" * 50)
print(f"Lasso regression:")
print(f"Estimated beta_0 (intercept):               {b_pred[0]:0.4}")
for i in range(1, 5):
    print(f"Estimated beta_{i} (coefficient for x{i}):      {b_pred[i]:0.4}")
print(f"MSE:                                        {mse:0.4}")
print(f"R^2 score:                                  {r2:0.4}")

# ===================================================================================================
# Ridge (L2 regularised) regression
model = linear_model.Ridge(alpha=1)
reg = model.fit(X, y)
y_pred = model.predict(X)
b_pred = [reg.intercept_] + list(reg.coef_)

# Calculate performance metrics
tss = np.dot(y - y.mean(), y - y.mean())
rss = np.dot(y_pred - y.mean(), y_pred - y.mean())
ess = tss - rss

mae = metrics.mean_absolute_error(y, y_pred)
mse = metrics.mean_squared_error(y, y_pred)
r2 = metrics.r2_score(y, y_pred)
exp_var = metrics.explained_variance_score(y, y_pred)

print("=" * 50)
print(f"Lasso regression:")
print(f"Estimated beta_0 (intercept):               {b_pred[0]:0.4}")
for i in range(1, 5):
    print(f"Estimated beta_{i} (coefficient for x{i}):      {b_pred[i]:0.4}")
print(f"MSE:                                        {mse:0.4}")
print(f"R^2 score:                                  {r2:0.4}")

Unregularised linear regression:
Estimated beta_0 (intercept):               4.991
Estimated beta_1 (coefficient for x1):      9.985
Estimated beta_2 (coefficient for x2):      0.1065
Estimated beta_3 (coefficient for x3):      -0.9968
Estimated beta_4 (coefficient for x4):      -0.2812
MSE:                                        0.009357
R^2 score:                                  0.9989
Lasso regression:
Estimated beta_0 (intercept):               7.324
Estimated beta_1 (coefficient for x1):      4.122
Estimated beta_2 (coefficient for x2):      0.0
Estimated beta_3 (coefficient for x3):      -0.0
Estimated beta_4 (coefficient for x4):      -0.0
MSE:                                        3.079
R^2 score:                                  0.6431
Lasso regression:
Estimated beta_0 (intercept):               5.11
Estimated beta_1 (coefficient for x1):      9.753
Estimated beta_2 (coefficient for x2):      0.1036
Estimated beta_3 (coefficient for x3):      -0.9911
Estimated beta_4 (coeff

### Future work

   * Use the statsmodels library
   * Leverage
   * Scale-location plots
   * Outliers and Cook's distance
   * Feature selection
   * Multicollinearity and confounding variables
   * Generalised linear models