In [None]:
# graphical libraries
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120
from IPython.display import Image
from IPython.display import display
plt.style.use('seaborn-white')

In [None]:
!pip install --upgrade --q scipy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# computational libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import scipy.stats as stats
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy.interpolate import interp1d, griddata, LinearNDInterpolator, NearestNDInterpolator
from math import ceil
from scipy import linalg
# the following line(s) are necessary if you want to make SKlearn compliant functions
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [None]:
lm = LinearRegression()
scale = StandardScaler()
qscale = QuantileTransformer()

In [None]:
# approach by Alex Gramfort --> only for 1-dimensional inputs

# https://gist.github.com/agramfort/850437

"""William S. Cleveland: "Robust locally weighted regression and smoothing
scatterplots", Journal of the American Statistical Association, December 1979,
volume 74, number 368, pp. 829-836.

William S. Cleveland and Susan J. Devlin: "Locally weighted regression: An
approach to regression analysis by local fitting", Journal of the American
Statistical Association, September 1988, volume 83, number 403, pp. 596-610."""

def lowess_ag(x, y, f=2. / 3., iter=3):
    """lowess(x, y, f=2./3., iter=3) -> yest
    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.
    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. The number of robustifying iterations is given by iter. The
    function will run faster with a smaller number of iterations.
    """
    n = len(x)
    r = int(ceil(f * n))
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3
    yest = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(iter):
        for i in range(n):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = linalg.solve(A, b)
            yest[i] = beta[0] + beta[1] * x[i]

        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2

    return yest

In [None]:
data = pd.read_csv('drive/MyDrive/Data Sets/cars.csv')

In [None]:
x = data.loc[:,'CYL':'WGT'].values
y = data['MPG'].values

In [None]:
data = pd.read_csv('drive/MyDrive/Data Sets/concrete.csv')
data

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [None]:
x = data.loc[:,'cement':'age'].values
y = data['strength'].values

In [None]:
scale = StandardScaler()
x = scale.fit_transform(x)

In [None]:
# here we have a function that computes the Euclidean distance between all the observations in u, and v
def dist(u,v):
  if len(v.shape)==1:
    v = v.reshape(1,-1)
  d = np.array([np.sqrt(np.sum((u-v[i])**2,axis=1)) for i in range(len(v))])
  return d

In [None]:
dist(x,x[0])

array([[0.        , 0.19300995, 5.83800882, ..., 5.22174043, 4.79245849,
        4.5329903 ]])

In [None]:
n = len(x)

In [None]:
h = [np.sort(np.sqrt(np.sum((x-x[i])**2,axis=1)))[30] for i in range(n)]

Here "axis=1" means that each observation is a row vector whose columns are the feature values. So the distance between $\vec{v}=[v_1,v_2,v_3]$ and a vector $u=[u_1,u_2,u_3]$ is:

$$\text{dist}(\vec{v},\vec{u}):=\sqrt{(v_1-u_1)^2+(v_2-u_2)^2+(v_3-u_3)^2}$$

In [None]:
w = np.clip(dist(x,x) / h, 0.0, 1.0)

In [None]:
w = (1 - w ** 3) ** 3

In [None]:
w.shape

(1030, 1030)

In [None]:
iter = 3
n = len(x)

In [None]:
X1 = np.column_stack([np.ones((len(x),1)),x])

In [None]:
lm = LinearRegression()

In [None]:
yest = np.zeros(n)
delta = np.ones(n)
for iteration in range(iter):
    for i in range(n):
        W = np.diag(delta*w[i,:])
        b = np.transpose(X1).dot(W).dot(y)
        A = np.transpose(X1).dot(W).dot(X1)
        #
        A = A + 0.0001*np.eye(X1.shape[1]) # if we want L2 regularization
        beta = linalg.solve(A, b) # A*theta = b
        #beta, res, rnk, s = linalg.lstsq(A, b)
        yest[i] = np.dot(X1[i],beta)
        # lm.fit(A,b)
        # yest[i] = lm.predict(X1[i].reshape(1,-1))

    residuals = y - yest
    s = np.median(np.abs(residuals))
    delta = np.clip(residuals / (6.0 * s), -1, 1)
    delta = (1 - delta ** 2) ** 2

In [None]:
mse(y,yest)

16.23843389161822

In [None]:
pca = PCA(n_components=3)

array([79.99      , 61.89      , 40.27      , ..., 23.69999921,
       32.76999336, 32.40000229])

In [None]:
x = np.linspace(0, 1, 5)
y = np.linspace(0, 2, 6)
z = np.linspace(0, 3, 7)

In [None]:
q = (x,y,z)

0.25

In [None]:
from scipy.spatial import Delaunay

In [None]:
tri = Delaunay(x,qhull_options='QJ Pp')

In [None]:
f = LinearNDInterpolator(tri,y[:10])

In [None]:
def lw_ag_md(x, y, xnew,f=2/3,iter=3, intercept=True):

  n = len(x)
  r = int(ceil(f * n))
  yest = np.zeros(n)

  if len(y.shape)==1: # here we make column vectors
    y = y.reshape(-1,1)

  if len(x.shape)==1:
    x = x.reshape(-1,1)

  if intercept:
    x1 = np.column_stack([np.ones((len(x),1)),x])
  else:
    x1 = x

  h = [np.sort(np.sqrt(np.sum((x-x[i])**2,axis=1)))[r] for i in range(n)]
  # dist(x,x) is always symmetric
  w = np.clip(dist(x,x) / h, 0.0, 1.0)
  w = (1 - w ** 3) ** 3

  #Looping through all X-points
  delta = np.ones(n)
  for iteration in range(iter):
    for i in range(n):
      W = np.diag(delta).dot(np.diag(w[i,:]))
      # when we multiply two diagional matrices we get also a diagonal matrix
      b = np.transpose(x1).dot(W).dot(y)
      A = np.transpose(x1).dot(W).dot(x1)
      ##
      A = A + 0.0001*np.eye(x1.shape[1]) # if we want L2 regularization
      beta = linalg.solve(A, b)
      #beta, res, rnk, s = linalg.lstsq(A, b)
      yest[i] = np.dot(x1[i],beta)

    residuals = y.ravel() - yest
    s = np.median(np.abs(residuals))

    delta = np.clip(residuals / (6.0 * s), -1, 1)

    delta = (1 - delta ** 2) ** 2


  if x.shape[1]==1:
    f = interp1d(x.flatten(),yest,fill_value='extrapolate')
    output = f(xnew)
  else:
    output = np.zeros(len(xnew))
    for i in range(len(xnew)):
      ind = np.argsort(np.sqrt(np.sum((x-xnew[i])**2,axis=1)))[:r]
      pca = PCA(n_components=2)
      x_pca = pca.fit_transform(x[ind])
      tri = Delaunay(x_pca,qhull_options='QJ')
      f = LinearNDInterpolator(tri,yest[ind])
      output[i] = f(pca.transform(xnew[i].reshape(1,-1))) # the output may have NaN's where the data points from xnew are outside the convex hull of X
  if sum(np.isnan(output))>0:
    g = NearestNDInterpolator(x,y.ravel())
    # output[np.isnan(output)] = g(X[np.isnan(output)])
    output[np.isnan(output)] = g(xnew[np.isnan(output)])
  return output

In [None]:
xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123)

In [None]:
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

In [None]:
yhat = lw_ag_md(xtrain,ytrain,xtest,f=1/3,iter=3,intercept=True)

In [None]:
mse(ytest,yhat)

17.276385671031164

## Scikit-Learn Compliant Functions

In [None]:
class Lowess_AG_MD:
    def __init__(self, f = 1/10, iter = 3,intercept=True):
        self.f = f
        self.iter = iter
        self.intercept = intercept

    def fit(self, x, y):
        f = self.f
        iter = self.iter
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        f = self.f
        iter = self.iter
        intercept = self.intercept
        return lw_ag_md(x, y, x_new, f, iter, intercept) # this is actually our defined function of Lowess

    def get_params(self, deep=True):
    # suppose this estimator has parameters "f", "iter" and "intercept"
        return {"f": self.f, "iter": self.iter,"intercept":self.intercept}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [None]:
model = Lowess_AG_MD(f=1/5,iter=3,intercept=True)
model.fit(xtrain,ytrain)
yhat = model.predict(xtest)
mse(ytest,yhat)

19.49626202008861

## "Cars" data

In [None]:
x = data.loc[:,'CYL':'WGT'].values
y = data['MPG'].values

## KFold Cross-Validations

In [None]:
mse_lwr = []
mse_rf = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
model_rf = RandomForestRegressor(n_estimators=200,max_depth=5)
model_lw = Lowess_AG_MD(f=1/3,iter=2,intercept=True)

for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x[idxtest]
  xtrain = scale.fit_transform(xtrain)
  xtest = scale.transform(xtest)

  model_lw.fit(xtrain,ytrain)
  yhat_lw = model_lw.predict(xtest)

  model_rf.fit(xtrain,ytrain)
  yhat_rf = model_rf.predict(xtest)

  mse_lwr.append(mse(ytest,yhat_lw))
  mse_rf.append(mse(ytest,yhat_rf))
print('The Cross-validated Mean Squared Error for Locally Weighted Regression is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for Random Forest is : '+str(np.mean(mse_rf)))

The Cross-validated Mean Squared Error for Locally Weighted Regression is : 16.530161925219694
The Cross-validated Mean Squared Error for Random Forest is : 17.27099378296926


## Grid Search CV

This is a quite slow gridsearch optimization.

In [None]:
lwr_pipe = Pipeline([('zscores', StandardScaler()),
                     ('lwr', Lowess_AG_MD())])

In [None]:
# here we have a subtle aspect: the local name of the predictor in the Pipeline: "lwr"
# to call the hyperparameters you need the local name and two "undescrore" symbols and then the name of the aprameter
params = [{'lwr__f': [1/i for i in range(3,15)],
         'lwr__iter': [1,2,3,4]}]

In [None]:
gs_lowess = GridSearchCV(lwr_pipe,
                      param_grid=params,
                      scoring='neg_mean_squared_error',
                      cv=5)
gs_lowess.fit(x, y)
gs_lowess.best_params_

{'lwr__f': 0.3333333333333333, 'lwr__iter': 2}

In [None]:
gs_lowess.score(x,y)

-14.882608801484357