# Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import scatter_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error

from sklearn.tree import plot_tree

In [None]:
# One shared RandomState, seeded with 2, that is passed as `random_state` to
# train_test_split and to the DecisionTreeRegressor instances below.
# NOTE(review): because this is a single stateful object rather than an int
# seed, results presumably depend on the order in which cells are (re-)run -
# confirm against scikit-learn's guidance on controlling randomness.
rng = np.random.RandomState(2)

## Read in dataset

In [None]:
import os
# get_ipython is not imported in this cell, so it presumably comes from the
# IPython/Jupyter runtime - TODO confirm this is never run as a plain script.
if 'google.colab' in str(get_ipython()):
    # The shell object mentions google.colab: mount Google Drive and work from it
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "./drive/My Drive/Colab Notebooks/" # You may need to change this, depending on where your notebooks are on Google Drive
else:
    # Otherwise, paths are relative to the current working directory
    base_dir = "."
# Folder from which the CSV file(s) are read
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
df = pd.read_csv(os.path.join(dataset_dir, "housing.csv"))

## Take a cheeky look

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe(include="all")

## Split into training set and test set

In [None]:
train, test = train_test_split(df, test_size=0.2, random_state=rng)

In [None]:
# Input columns used by the models below. The order matters: any hand-built
# array that is compared with the rows of X_train.values must list its values
# in exactly this order.
features = ["BasementArea", "GroundFloorArea", "Bedrooms", "Condition"]

# Inputs (the feature columns) and target (the SalePrice column) for the
# training set and the test set
X_train = train[features]
y_train = train["SalePrice"]
X_test = test[features]
y_test = test["SalePrice"]

## Exploratory Data Analysis (EDA)

In [None]:
scatter_matrix(train, figsize=(10, 10))
plt.show()

In [None]:
sns.heatmap(train.corr(numeric_only=True), annot=True)
plt.show()

## We can use a Decision Tree - a Decision Tree Regressor

In [None]:
decision_tree = DecisionTreeRegressor(max_depth=2, random_state=rng)

In [None]:
decision_tree.fit(X_train, y_train)

In [None]:
fig = plt.figure(figsize=(20,8))
plot_tree(decision_tree, feature_names=features, fontsize=12)
plt.show()

In [None]:
decision_tree = DecisionTreeRegressor(max_depth=9, random_state=rng)

In [None]:
decision_tree.fit(X_train, y_train)

In [None]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): actual values first
mean_absolute_error(y_test, decision_tree.predict(X_test))

## k-Nearest-Neighbours Regressor - try it first - explain afterwards

In [None]:
knn = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", KNeighborsRegressor(n_neighbors = 3))
])

In [None]:
knn.fit(X_train, y_train)

In [None]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): actual values first
mean_absolute_error(y_test, knn.predict(X_test))

In [None]:
knn = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", KNeighborsRegressor(n_neighbors = 9))
])

In [None]:
knn.fit(X_train, y_train)

In [None]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): actual values first
mean_absolute_error(y_test, knn.predict(X_test))

## Similarity - in fact, distance - in fact, Euclidean distance

In [None]:
def euc(x, xprime):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    x, xprime : array-like
        The coordinates of the two points (NumPy arrays, lists or tuples).
        They must have the same length, or be broadcastable against each other.

    Returns
    -------
    NumPy float
        The square root of the sum of the squared coordinate differences.
    """
    # np.asarray lets callers pass plain lists/tuples, for which `-` is not
    # defined; NumPy arrays pass through unchanged.
    difference = np.asarray(x) - np.asarray(xprime)
    return np.sqrt(np.sum(difference ** 2))

In [None]:
# Example
#
# The values in each array must follow the column order of `features`
# (BasementArea, GroundFloorArea, Bedrooms, Condition), because the arrays are
# compared position-by-position with the rows of X_train.values further down.

# Your house
#    BasementArea      782.0
#    GroundFloorArea  1652.0
#    Bedrooms            3.0
#    Condition           5.0

your_house = np.array([782.0, 1652.0, 3.0, 5.0])

# My house
#    BasementArea     1518.0
#    GroundFloorArea  1518.0
#    Bedrooms            1.0
#    Condition           5.0

my_house = np.array([1518.0, 1518.0, 1.0, 5.0])

# Euclidean distance between the two houses (on the raw, unscaled values)
print( euc(your_house, my_house) )

## Let's "roll our own" 1NN regressor - unnecessary - but informative

In [None]:
dists = [euc(your_house, x) for x in X_train.values]

In [None]:
# Just to show you, here are the first 3 distances
dists[:3]

In [None]:
# Even better, we can, with one line of code, find the most similar house
np.min([euc(your_house, x) for x in X_train.values])

In [None]:
# Even better again, we can find which house is the most similar
np.argmin([euc(your_house, x) for x in X_train.values])

In [None]:
# Even better, we can display the most similar house
X_train.iloc[np.argmin([euc(your_house, x) for x in X_train.values])]

In [None]:
# Best of all, we can display the SalePrice of the most similar house
y_train.iloc[np.argmin([euc(your_house, x) for x in X_train.values])]

In [None]:
# Put it all together

def nn_regressor(your_house):
    """Predict a target value by copying that of the single nearest training example.

    Distances are Euclidean (via euc) between your_house and every row of the
    global X_train; the answer is the matching entry of the global y_train.
    """
    # Distance from the query to each training example, in row order
    distances = [euc(your_house, row) for row in X_train.values]
    # Position of the smallest distance = position of the most similar example
    nearest = np.argmin(distances)
    return y_train.iloc[nearest]

In [None]:
print( nn_regressor(your_house) )

## And our own kNN regressor

In [None]:
def knn_regressor(your_house, k):
    """Predict a target value as the mean over the k nearest training examples.

    Distances are Euclidean (via euc) between your_house and every row of the
    global X_train; the k smallest select which entries of the global y_train
    are averaged.
    """
    # Distance from the query to each training example, in row order
    distances = [euc(your_house, row) for row in X_train.values]
    # Positions of the k examples in X_train that are most similar to your_house
    nearest = np.argsort(distances)[:k]
    # Average the corresponding target values from y_train
    neighbour_targets = y_train.iloc[nearest]
    return neighbour_targets.mean()

In [None]:
print( knn_regressor(your_house, k=3) )

## An object-oriented version - closer to the one in scikit-learn

In [None]:
class Our_kNN():
    """A minimal k-nearest-neighbours regressor using Euclidean distance.

    Parameters
    ----------
    k : int, default=3
        How many nearest training examples are averaged to make a prediction.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        """Memorise the training set; no other work is done here.

        Parameters
        ----------
        X : array-like
            The training examples, one per row (NumPy array, DataFrame or
            nested list).
        y : array-like
            The target values, one per training example (NumPy array, Series
            or list).

        Returns
        -------
        Our_kNN
            This object, so that calls can be chained.

        Raises
        ------
        ValueError
            If the training set is empty, or X and y differ in length.
        """
        # Convert once, so that predict can use positional NumPy indexing
        # whatever container the caller handed over.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("Cannot fit on an empty training set")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        return self

    def predict(self, q):
        """Predict the target value for one query example.

        Parameters
        ----------
        q : array-like
            The query example, with its values in the same order as the
            columns of the X given to fit.

        Returns
        -------
        NumPy float
            The mean target value of the k training examples nearest to q.
        """
        # One row of differences per training example. The reshape keeps this
        # working when each example is a single number rather than a row.
        differences = (self.X - np.asarray(q)).reshape(len(self.X), -1)
        # Euclidean distance from q to every training example
        distances = np.sqrt(np.sum(differences ** 2, axis=1))
        # Positions of the k smallest distances
        indexes = np.argsort(distances)[:self.k]
        return self.y[indexes].mean()

In [None]:
our_knn = Our_kNN(k = 3)

our_knn.fit(X_train.values, y_train)

our_knn.predict(your_house)

<p>This version emphasises how little work kNN does during <code>fit</code>. All the real work happens at inference time: when we call <code>predict</code>.</p>
<p>By the way, this version is not identical to the one in scikit-learn because the one in scikit-learn can do multiple predictions at once, whereas our version makes just one prediction at a time.</p>

## But we should scale the feature-values - compare with and without

In [None]:
knn_without_scaling = KNeighborsRegressor(n_neighbors = 3)

In [None]:
knn_without_scaling.fit(X_train, y_train)

In [None]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): actual values first
mean_absolute_error(y_test, knn_without_scaling.predict(X_test))

In [None]:
knn_with_scaling = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", KNeighborsRegressor(n_neighbors = 3))
])

In [None]:
knn_with_scaling.fit(X_train, y_train)

In [None]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): actual values first
mean_absolute_error(y_test, knn_with_scaling.predict(X_test))

## Decision Trees, by the way, are scale-invariant - no need to scale - although it does no harm

In [None]:
decision_tree_without_scaling = DecisionTreeRegressor(max_depth = 3, random_state=rng)

In [None]:
decision_tree_without_scaling.fit(X_train, y_train)

In [None]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): actual values first
mean_absolute_error(y_test, decision_tree_without_scaling.predict(X_test))

In [None]:
decision_tree_with_scaling = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", DecisionTreeRegressor(max_depth = 3, random_state=rng))
])

In [None]:
decision_tree_with_scaling.fit(X_train, y_train)

In [None]:
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): actual values first
mean_absolute_error(y_test, decision_tree_with_scaling.predict(X_test))

## BTW 1NN illustrates why we should use test error - not training error - for error estimation

In [None]:
knn = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", KNeighborsRegressor(n_neighbors = 1))
])

In [None]:
knn.fit(X_train, y_train)

In [None]:
# Test error. scikit-learn's signature is mean_absolute_error(y_true, y_pred)
mean_absolute_error(y_test, knn.predict(X_test))

In [None]:
# Training error. scikit-learn's signature is mean_absolute_error(y_true, y_pred)
mean_absolute_error(y_train, knn.predict(X_train))