# Linear Regression

In [None]:
import os

import pandas as pd
import numpy as np
import numpy.linalg as npla
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interactive

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import add_dummy_feature

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

In [None]:
# Seeded random state (seed 2); passed as `random_state` to train_test_split
# further down so the train/test split is repeatable from a fresh kernel.
rng = np.random.RandomState(2)

## Read in dataset

In [None]:
# Work out where the notebook's files live: on Colab, mount Google Drive and
# use the notebooks folder inside it; otherwise use the current directory.
# (`os` is imported in the import cell at the top of the notebook.)
if 'google.colab' in str(get_ipython()):
    # Imported here, not at the top, because the module only exists on Colab.
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "./drive/My Drive/Colab Notebooks/" # You may need to change this, depending on where your notebooks are on Google Drive
else:
    base_dir = "."
# All datasets are expected in a "datasets" folder under base_dir.
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
# Load the housing data from <dataset_dir>/housing.csv into a DataFrame.
df = pd.read_csv(os.path.join(dataset_dir, "housing.csv"))

## Split into training set and test set

In [None]:
# Hold out 20% of the rows as the test set; the remaining 80% is the training set.
# NOTE: `rng` is a stateful RandomState instance, so re-running only this cell
# (without re-running the cell that creates `rng`) will likely give a different split.
train, test = train_test_split(df, test_size=0.2, random_state=rng)

In [None]:
# Input columns for the models below; "SalePrice" is the target column.
features = ["BasementArea", "GroundFloorArea", "Bedrooms", "Condition"]

# Split each partition into its feature matrix (X) and target vector (y).
X_train, y_train = train[features], train["SalePrice"]
X_test, y_test = test[features], test["SalePrice"]

## Linear Regression using Scikit-Learn

Scikit-Learn's LinearRegression class is trained using the Normal Equation (see slides). The Normal Equation has no hyperparameters, so we don't need any model selection (validation sets, grid search), and it is scale-invariant, so we don't need a preprocessor for scaling. (Of course, on other datasets, you might have a preprocessor to do other things, e.g. to convert nominal-valued features.)

In [None]:
# Create scikit-learn's linear regressor with its default settings.
linear_model = LinearRegression()

In [None]:
# Fit on the unscaled training features.
linear_model.fit(X_train, y_train)

In [None]:
# Display the learned intercept and the coefficients (one per column of X_train,
# i.e. in the order given by `features`).
linear_model.intercept_, linear_model.coef_

In [None]:
# Test-set mean absolute error. scikit-learn metrics take (y_true, y_pred) in
# that order; MAE is symmetric, so the value is the same either way, but the
# conventional order avoids surprises when swapping in a non-symmetric metric.
mean_absolute_error(y_test, linear_model.predict(X_test))

## Interpretability

The model we have learned is 
$$\hat{y} = 4431 + 62 \times \mathit{BasementArea} + 103 \times \mathit{GroundFloorArea} - 20717 \times \mathit{Bedrooms} + 2639 \times \mathit{Condition}$$

- Do you find that to be interpretable?
- Should we conclude that $\mathit{Condition}$ is the most important feature? Should we conclude that $\mathit{Bedrooms}$ is an unhelpful feature?

In [None]:
# Two-step pipeline: standardise the features, then fit a linear regressor.
# The step names are used later to look the fitted predictor up via `named_steps`.
linear_model_scaled = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("predictor", LinearRegression()),
])

In [None]:
# Fit the scaler and the regressor on all four training features.
linear_model_scaled.fit(X_train, y_train)

In [None]:
# Display the intercept and coefficients of the regressor inside the pipeline;
# these coefficients apply to the standardised features, not the raw ones.
linear_model_scaled.named_steps["predictor"].intercept_, linear_model_scaled.named_steps["predictor"].coef_

This time the model we have learned is 
$$\hat{y} = 179966 + 27717 \times \mathit{BasementArea (scaled)} + 51958 \times \mathit{GroundFloorArea (scaled)} - 16856 \times \mathit{Bedrooms (scaled)} + 2979 \times \mathit{Condition (scaled)}$$
- Now, $\mathit{GroundFloorArea}$ is the most important feature!

In [None]:
# Refit the same pipeline object on three features only (GroundFloorArea dropped).
# NOTE: this overwrites the four-feature fit previously held in `linear_model_scaled`.
linear_model_scaled.fit(X_train[["BasementArea", "Bedrooms", "Condition"]], y_train)

In [None]:
# Display the intercept and coefficients of the refitted (three-feature) model.
linear_model_scaled.named_steps["predictor"].intercept_, linear_model_scaled.named_steps["predictor"].coef_

Without the $\mathit{GroundFloorArea}$, the model we have learned is 
$$\hat{y} = 179966 + 50050 \times \mathit{BasementArea (scaled)} + 8539 \times \mathit{Bedrooms (scaled)} + 1191 \times \mathit{Condition (scaled)}$$
- The coefficient for $\mathit{Bedrooms}$ is no longer negative.
- Why will coefficients sometimes be negative?

In [None]:
# Pairwise correlations between the training features, drawn as an annotated heatmap.
sns.heatmap(data=X_train.corr(numeric_only=True), annot=True)
plt.show()

The rest of this Jupyter Notebook is devoted to trying to help explain what's going on 'under the bonnet'.

## Linear Regression - Under the Bonnet

### Linear Regression with one feature

In [None]:
# Scatter plot of sale price against ground-floor area for the training rows.
plot = sns.scatterplot(data=train, x="GroundFloorArea", y="SalePrice")

In [None]:
def J(X, y, b, w):
    """Return the halved mean squared error of the linear model b + X.w.

    X: examples (one row each); must provide `.dot(w)`.
    y: target value for each row of X.
    b: intercept.
    w: weight vector, one entry per column of X.
    """
    residuals = X.dot(w) + b - y
    return np.mean(residuals ** 2) / 2.0
    
def show_linear_model(b, w):
    """Plot the training data, the line y = b + w * GroundFloorArea, and its loss.

    b: intercept of the line.
    w: slope of the line (weight on GroundFloorArea).

    Draws on a new figure with fixed axis limits and returns nothing.
    """
    _, axes = plt.subplots()
    axes.set_xlim(0, 6000)
    axes.set_ylim(0, 700000)
    sns.scatterplot(data=train, x="GroundFloorArea", y="SalePrice", ax=axes)

    # Two end points are enough to draw a straight line across the x-range.
    line_x = np.array([0, 6000])
    line_y = b + w * line_x
    sns.lineplot(x=line_x, y=line_y, color='g', ax=axes)

    # Annotate the plot with the training loss of this (b, w) pair.
    loss = J(X_train[["GroundFloorArea"]], y_train, b, np.array([w]))
    axes.text(3400, 650000, "Loss: " + str(loss))

# Build sliders for b and w, each given as (min, max, step), that call
# show_linear_model with the current slider values; the bare name on the last
# line displays the widget.
interactive_plot = interactive(show_linear_model, b=(0,700000, 5000), w=(-1000,1000,10))
interactive_plot

### Linear Regression with two features

In [None]:
def J(X, y, b, w):
    """Return the halved mean squared error of the linear model b + X.w.

    X: examples (one row each); must provide `.dot(w)`.
    y: target value for each row of X.
    b: intercept.
    w: weight vector, one entry per column of X.
    """
    residuals = X.dot(w) + b - y
    return np.mean(residuals ** 2) / 2.0
    
def show_linear_model(b, w1, w2):
    """Plot the training data in 3D with the plane b + w1*GroundFloorArea + w2*BasementArea.

    b: intercept of the plane.
    w1: weight on GroundFloorArea (x-axis).
    w2: weight on BasementArea (y-axis).

    Draws on a new figure with fixed axis limits and returns nothing.
    """
    figure = plt.figure()
    ax3d = figure.add_subplot(111, projection='3d')

    # Axis labels.
    ax3d.set_xlabel("GroundFloorArea")
    ax3d.set_ylabel("BasementArea")
    ax3d.set_zlabel("SalePrice")

    # Fixed axis limits.
    ax3d.set_xlim(0, 6000)
    ax3d.set_ylim(0, 6000)
    ax3d.set_zlim(0, 700000)

    ax3d.scatter(train["GroundFloorArea"],  train["BasementArea"], train["SalePrice"], color="green")

    # A plane is fully determined by its four corners over the plotted x/y range.
    corners = np.linspace(0, 6000, 2)
    ground_grid, basement_grid = np.meshgrid(corners, corners)
    plane = b + w1*ground_grid + w2*basement_grid
    ax3d.plot_surface(ground_grid, basement_grid, plane, color=(0, 0, 1, 0.2))

    # Annotate with the training loss; z=1000000 is above the z-limit of 700000.
    loss = J(X_train[["GroundFloorArea", "BasementArea"]], y_train, b, np.array([w1, w2]))
    ax3d.text(200, 0, 1000000, "Loss: " + str(loss))


# Build sliders for b, w1 and w2, each given as (min, max, step), that call the
# 3D show_linear_model with the current slider values; the bare name on the
# last line displays the widget.
interactive_plot = interactive(show_linear_model, b=(0, 700000, 5000), w1=(-1000,1000,10), w2=(-1000,1000,10))
interactive_plot

### MSE is a Convex Loss Function

In [None]:
# Standardise the two features used for this plot.
X_train_scaled = StandardScaler().fit_transform(X_train[["GroundFloorArea", "BasementArea"]])

fig = plt.figure()
plt.xlabel("w1")
plt.ylabel("w2")

# 100 x 100 grid of candidate (w1, w2) weight pairs.
xvals = np.linspace(-100000000, 100000000, 100)
yvals = np.linspace(-100000000, 100000000, 100)
xxvals, yyvals = np.meshgrid(xvals, yvals)

# Loss at every grid point, with the intercept fixed at 0.
zs = np.array([
    J(X_train_scaled, y_train, 0, np.array(weights))
    for weights in zip(xxvals.ravel(), yyvals.ravel())
])
zvals = zs.reshape(xxvals.shape)

# 15 labelled contour levels of the loss surface.
C = plt.contour(xxvals, yyvals, zvals, 15, colors = "black")
plt.clabel(C, inline=1, fontsize=10)
plt.show()

(The above plot assumed b=0 so that we could use a 2D diagram, and it shows the values of w1 and w2 for scaled features, not the original feature values.)

### Let's "roll our own" linear regressor using the Normal Equation - unnecessary - but informative

In [None]:
class OurLinearRegressor_v1:
    """Linear regressor fitted in closed form with the Normal Equation.

    After `fit`, `param_vals` holds the learned parameters: the intercept
    first, followed by one weight per feature.
    """

    def fit(self, X, y):
        """Compute param_vals = inv(X^T X) X^T y, where X gains a leading dummy column."""
        # The dummy feature (a column of 1s) lets the intercept be learned as a weight.
        design = add_dummy_feature(X)
        gram_inverse = npla.inv(design.T.dot(design))
        self.param_vals = gram_inverse.dot(design.T).dot(y)

    def predict(self, X):
        """Return predictions for X using the parameters learned by `fit`."""
        design = add_dummy_feature(X)
        return design.dot(self.param_vals)

In [None]:
# NOTE: this rebinds `linear_model`, which earlier held the scikit-learn LinearRegression.
linear_model = OurLinearRegressor_v1()

In [None]:
# Fit our own regressor on the same (unscaled) training data.
linear_model.fit(X_train, y_train)

In [None]:
# Display the learned parameters: the intercept first, then one weight per feature.
linear_model.param_vals

In [None]:
# Test-set mean absolute error of our regressor. scikit-learn metrics take
# (y_true, y_pred) in that order; MAE is symmetric, so the value is unchanged.
mean_absolute_error(y_test, linear_model.predict(X_test))

But there's a problem. The normal equation requires that $X^TX$ has an inverse (where $X$ is `X_train` with the dummy feature added). But it might not, e.g. when one feature is a linear combination of others. There is something called the pseudo-inverse which we can often use instead. So here's a more robust way of writing this class.

In [None]:
class OurLinearRegressor_v2():
    """Linear regressor fitted in closed form using the Moore-Penrose pseudo-inverse.

    Unlike an explicit inverse of X^T X, the pseudo-inverse is defined even
    when X^T X is singular (e.g. duplicated or linearly dependent features).
    In that case it yields the least-squares solution of minimum norm.

    After `fit`, `param_vals` holds the learned parameters: the intercept
    first, followed by one weight per feature.
    """

    def fit(self, X, y):
        """Learn param_vals = pinv(X) y, where X gains a leading dummy column."""
        # The dummy feature (a column of 1s) lets the intercept be learned as a weight.
        X = add_dummy_feature(X)
        # pinv(X) equals inv(X^T X) X^T whenever X^T X is invertible, so this
        # agrees with the Normal Equation in the well-posed case.
        self.param_vals = npla.pinv(X).dot(y)

    def predict(self, X):
        """Return predictions for X using the parameters learned by `fit`."""
        X = add_dummy_feature(X)
        return X.dot(self.param_vals)

In [None]:
# NOTE: this rebinds `linear_model` again, replacing the OurLinearRegressor_v1 instance.
linear_model = OurLinearRegressor_v2()

In [None]:
# Fit the second version of our regressor on the same training data.
linear_model.fit(X_train, y_train)

In [None]:
# Display the learned parameters: the intercept first, then one weight per feature.
linear_model.param_vals

In [None]:
# Test-set mean absolute error of our regressor. scikit-learn metrics take
# (y_true, y_pred) in that order; MAE is symmetric, so the value is unchanged.
mean_absolute_error(y_test, linear_model.predict(X_test))